From b7b9dc0b35a836dedb02b109c9d4a79a0db2aaca Mon Sep 17 00:00:00 2001 From: Gwen Date: Mon, 26 Oct 2020 15:19:38 -0600 Subject: [PATCH 001/154] Fix for bugs in lazy write handling --- src/gpgpu-sim/gpu-cache.cc | 5 ++++- src/gpgpu-sim/gpu-cache.h | 27 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 75c369136..d44c959b3 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1455,16 +1455,19 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); - block->set_status(MODIFIED, mf->get_access_sector_mask()); if (m_status == HIT_RESERVED) { block->set_ignore_on_fill(true, mf->get_access_sector_mask()); block->set_modified_on_fill(true, mf->get_access_sector_mask()); + } else { + block->set_status(MODIFIED, mf->get_access_sector_mask()); } if (mf->get_access_byte_mask().count() == m_config.get_atom_sz()) { block->set_m_readable(true, mf->get_access_sector_mask()); } else { block->set_m_readable(false, mf->get_access_sector_mask()); + if (m_status == HIT_RESERVED) + block->set_readable_on_fill(true, mf->get_access_sector_mask()); } if (m_status != RESERVATION_FAIL) { diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 5c28b41f6..25d0b7826 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -128,6 +128,8 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_modified_on_fill(bool m_modified, mem_access_sector_mask_t sector_mask) = 0; + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) = 0; virtual unsigned get_modified_size() = 0; virtual void set_m_readable(bool readable, mem_access_sector_mask_t sector_mask) = 0; @@ -147,6 +149,7 @@ struct line_cache_block : public 
cache_block_t { m_status = INVALID; m_ignore_on_fill_status = false; m_set_modified_on_fill = false; + m_set_readable_on_fill = false; m_readable = true; } void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time, @@ -159,12 +162,16 @@ struct line_cache_block : public cache_block_t { m_status = RESERVED; m_ignore_on_fill_status = false; m_set_modified_on_fill = false; + m_set_readable_on_fill = false; } void fill(unsigned time, mem_access_sector_mask_t sector_mask) { // if(!m_ignore_on_fill_status) // assert( m_status == RESERVED ); m_status = m_set_modified_on_fill ? MODIFIED : VALID; + + if (m_set_readable_on_fill) + m_readable = true; m_fill_time = time; } @@ -197,6 +204,10 @@ struct line_cache_block : public cache_block_t { mem_access_sector_mask_t sector_mask) { m_set_modified_on_fill = m_modified; } + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) { + m_set_readable_on_fill = readable; + } virtual unsigned get_modified_size() { return SECTOR_CHUNCK_SIZE * SECTOR_SIZE; // i.e. 
cache line size } @@ -218,6 +229,7 @@ struct line_cache_block : public cache_block_t { cache_block_state m_status; bool m_ignore_on_fill_status; bool m_set_modified_on_fill; + bool m_set_readable_on_fill; bool m_readable; }; @@ -232,6 +244,7 @@ struct sector_cache_block : public cache_block_t { m_status[i] = INVALID; m_ignore_on_fill_status[i] = false; m_set_modified_on_fill[i] = false; + m_set_readable_on_fill[i] = false; m_readable[i] = true; } m_line_alloc_time = 0; @@ -261,6 +274,7 @@ struct sector_cache_block : public cache_block_t { m_status[sidx] = RESERVED; m_ignore_on_fill_status[sidx] = false; m_set_modified_on_fill[sidx] = false; + m_set_readable_on_fill[sidx] = false; // set line stats m_line_alloc_time = time; // only set this for the first allocated sector @@ -283,6 +297,8 @@ struct sector_cache_block : public cache_block_t { else m_set_modified_on_fill[sidx] = false; + m_set_readable_on_fill[sidx] = false; + m_status[sidx] = RESERVED; m_ignore_on_fill_status[sidx] = false; // m_set_modified_on_fill[sidx] = false; @@ -300,6 +316,11 @@ struct sector_cache_block : public cache_block_t { // assert( m_status[sidx] == RESERVED ); m_status[sidx] = m_set_modified_on_fill[sidx] ? 
MODIFIED : VALID; + + if (m_set_readable_on_fill[sidx]) { + m_readable[sidx] = true; + m_set_readable_on_fill[sidx] = false; + } m_sector_fill_time[sidx] = time; m_line_fill_time = time; @@ -366,6 +387,11 @@ struct sector_cache_block : public cache_block_t { m_set_modified_on_fill[sidx] = m_modified; } + virtual void set_readable_on_fill(bool readable, + mem_access_sector_mask_t sector_mask) { + unsigned sidx = get_sector_index(sector_mask); + m_set_readable_on_fill[sidx] = readable; + } virtual void set_m_readable(bool readable, mem_access_sector_mask_t sector_mask) { unsigned sidx = get_sector_index(sector_mask); @@ -400,6 +426,7 @@ struct sector_cache_block : public cache_block_t { cache_block_state m_status[SECTOR_CHUNCK_SIZE]; bool m_ignore_on_fill_status[SECTOR_CHUNCK_SIZE]; bool m_set_modified_on_fill[SECTOR_CHUNCK_SIZE]; + bool m_set_readable_on_fill[SECTOR_CHUNCK_SIZE]; bool m_readable[SECTOR_CHUNCK_SIZE]; unsigned get_sector_index(mem_access_sector_mask_t sector_mask) { From 950464e7f8e512f2beb0c9e0883db3489bf84cec Mon Sep 17 00:00:00 2001 From: allen Date: Mon, 9 Nov 2020 21:43:08 +0900 Subject: [PATCH 002/154] change address type into ull --- src/abstract_hardware_model.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 49f3e9f90..c012de0d8 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -75,8 +75,8 @@ enum AdaptiveCache { FIXED = 0, ADAPTIVE_VOLTA = 1 }; typedef unsigned long long new_addr_type; typedef unsigned long long cudaTextureObject_t; -typedef unsigned address_type; -typedef unsigned addr_t; +typedef unsigned long long address_type; +typedef unsigned long long addr_t; // the following are operations the timing model can see #define SPECIALIZED_UNIT_NUM 8 From 07f77e1c3d1f1222de21bb77e4dcc5a6ab94a90f Mon Sep 17 00:00:00 2001 From: allen Date: Mon, 9 Nov 2020 21:46:01 +0900 Subject: [PATCH 003/154] do not truncate 32 MSB bits 
of the memory address --- src/abstract_hardware_model.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 5ad6f105d..e0e1d23cf 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -205,8 +205,8 @@ gpgpu_t::gpgpu_t(const gpgpu_functional_sim_config &config, gpgpu_context *ctx) gpu_tot_sim_cycle = 0; } -address_type line_size_based_tag_func(new_addr_type address, - new_addr_type line_size) { +new_addr_type line_size_based_tag_func(new_addr_type address, + new_addr_type line_size) { // gives the tag for an address based on a given line size return address & ~(line_size - 1); } @@ -448,7 +448,7 @@ void warp_inst_t::generate_mem_accesses() { for (unsigned thread = 0; thread < m_config->warp_size; thread++) { if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - unsigned block_address = line_size_based_tag_func(addr, cache_block_size); + new_addr_type block_address = line_size_based_tag_func(addr, cache_block_size); accesses[block_address].set(thread); unsigned idx = addr - block_address; for (unsigned i = 0; i < data_size; i++) byte_mask.set(idx + i); @@ -530,7 +530,7 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, (m_per_scalar_thread[thread].memreqaddr[access] != 0); access++) { new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[access]; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte // chunk does this thread access? 
@@ -552,7 +552,7 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, if (block_address != line_size_based_tag_func( addr + data_size_coales - 1, segment_size)) { addr = addr + data_size_coales - 1; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; transaction_info &info = subwarp_transactions[block_address]; info.chunks.set(chunk); @@ -625,7 +625,7 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - unsigned block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte chunk // does this thread access? From 132c2ce4ef3ff12f984881ca4b6a8780797dacff Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Sun, 15 Nov 2020 15:41:39 -0500 Subject: [PATCH 004/154] added MSHR_HIT --- src/gpgpu-sim/gpu-cache.cc | 3 ++- src/gpgpu-sim/gpu-cache.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 75c369136..613403a49 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -37,7 +37,7 @@ const char *cache_request_status_str(enum cache_request_status status) { static const char *static_cache_request_status_str[] = { - "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", "SECTOR_MISS"}; + "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", "SECTOR_MISS", "MSHR_HIT"}; assert(sizeof(static_cache_request_status_str) / sizeof(const char *) == NUM_CACHE_REQUEST_STATUS); @@ -1123,6 +1123,7 @@ void baseline_cache::send_read_request(new_addr_type addr, m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); m_mshrs.add(mshr_addr, mf); + 
m_stats.inc_stats(mf->get_access_type(), MSHR_HIT); do_miss = true; } else if (!mshr_hit && mshr_avail && diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 5c28b41f6..17c8c02d8 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -49,6 +49,7 @@ enum cache_request_status { MISS, RESERVATION_FAIL, SECTOR_MISS, + MSHR_HIT, NUM_CACHE_REQUEST_STATUS }; From f3a00778b98cf101c8052e9fe1dd2d4c08185b7e Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Fri, 12 Feb 2021 16:13:46 -0500 Subject: [PATCH 005/154] bug fix was_writeback_sent --- src/gpgpu-sim/gpu-cache.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index af22c4c2c..eb9500485 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -496,8 +496,10 @@ bool was_writeback_sent(const std::list &events, cache_event &wb_event) { for (std::list::const_iterator e = events.begin(); e != events.end(); e++) { - if ((*e).m_cache_event_type == WRITE_BACK_REQUEST_SENT) wb_event = *e; - return true; + if ((*e).m_cache_event_type == WRITE_BACK_REQUEST_SENT) { + wb_event = *e; + return true; + } } return false; } From 51d99259845a051a32e45763bdf3005b4dff74b5 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 15 Feb 2021 16:03:51 -0500 Subject: [PATCH 006/154] fix hash funciton --- src/gpgpu-sim/gpu-cache.cc | 4 ++-- src/gpgpu-sim/gpu-cache.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 8f7ccd591..1c36d224c 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -63,8 +63,8 @@ unsigned l1d_cache_config::set_bank(new_addr_type addr) const { // For sector cache, we select one sector per bank (sector interleaving) // This is what was found in Volta (one sector per bank, sector interleaving) // otherwise, line 
interleaving - return cache_config::hash_function(addr, l1_banks, l1_banks_byte_interleaving, - m_l1_banks_log2, + return cache_config::hash_function(addr, l1_banks, l1_banks_byte_interleaving_log2, + l1_banks_log2, l1_banks_hashing_function); } diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 26369c33a..00c09ae55 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -817,15 +817,15 @@ class l1d_cache_config : public cache_config { l1d_cache_config() : cache_config() {} unsigned set_bank(new_addr_type addr) const; void init(char *config, FuncCache status) { - m_banks_byte_interleaving_log2 = LOGB2(l1_banks_byte_interleaving); - m_l1_banks_log2 = LOGB2(l1_banks); + l1_banks_byte_interleaving_log2 = LOGB2(l1_banks_byte_interleaving); + l1_banks_log2 = LOGB2(l1_banks); cache_config::init(config, status); } unsigned l1_latency; unsigned l1_banks; - unsigned m_l1_banks_log2; + unsigned l1_banks_log2; unsigned l1_banks_byte_interleaving; - unsigned m_banks_byte_interleaving_log2; + unsigned l1_banks_byte_interleaving_log2; unsigned l1_banks_hashing_function; }; From b430b36911b48228ed7eb77457cc378261151a13 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Thu, 25 Feb 2021 16:25:43 -0500 Subject: [PATCH 007/154] adding new RTX 3070 config --- .../SM86_RTX3070/config_ampere_islip.icnt | 74 +++++++ .../tested-cfgs/SM86_RTX3070/gpgpusim.config | 192 ++++++++++++++++++ 2 files changed, 266 insertions(+) create mode 100644 configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt create mode 100644 configs/tested-cfgs/SM86_RTX3070/gpgpusim.config diff --git a/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt b/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt new file mode 100644 index 000000000..6775d5d6f --- /dev/null +++ b/configs/tested-cfgs/SM86_RTX3070/config_ampere_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 78; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; 
+read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config new file mode 100644 index 000000000..2010aa698 --- /dev/null +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -0,0 +1,192 @@ +# This config models the Ampere RTX 3070 +# For more info about Ampere architecture: +# https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf +# https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf +# https://en.wikipedia.org/wiki/GeForce_30_series +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 86 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 8 +-gpgpu_compute_capability_minor 6 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 46 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 16 +-gpgpu_n_sub_partition_per_mchannel 2 + +# Ampere clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1320.0:1320.0:1320.0:3500.0 +# boost mode +# -gpgpu_clock_domains 1780.0:1780.0:1780.0:3500.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 86 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# 
ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Ampere GA102 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 32 +-ptx_opcode_initiation_tensor 32 + +# Ampere has sub core model, in which each scheduler has its own register file and EUs +# i.e. 
schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# Ampere has 24 double-ported banks, 4 schedulers, 6 banks per scheduler +-gpgpu_num_reg_banks 24 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 86 + +# Ampere has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler gto +## In Ampere, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Default config is 28KB DL1 and 100KB shared memory +# In Ampere, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-8-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +# Ampere unified cache has four banks +-gpgpu_l1_banks 4 +-gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_shmem_size 102400 +-gpgpu_shmem_sizeDefault 102400 +-gpgpu_shmem_per_block 102400 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_latency 20 +-gpgpu_smem_latency 20 +-gpgpu_flush_l1_cache 1 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 3MB L2 cache +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 1 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprecated, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_ampere_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# Ampere RTX3060 has GDDR6 +# http://monitorinsider.com/GDDR6.html +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 2 +-gpgpu_dram_burst_length 16 +-dram_data_command_freq_ratio 4 +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS + +# Use the same GDDR5 timing, scaled to 3500MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62: + CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4" + +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# power model configs, disable it untill we create a real energy model for Ampere +-power_simulation_enabled 0 + +# tracing 
functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + From 09f10eb4c28b6cc76c7c7cc3181c340cf8ec2be5 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Thu, 25 Mar 2021 12:44:09 -0400 Subject: [PATCH 008/154] change the L1 cache policy to be on-miss based on recent ubench --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 6fe04eecd..8be9a73d2 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -101,7 +101,7 @@ # ** Optional parameter - Required when mshr_type==Texture Fifo -gpgpu_adaptive_cache_config 0 -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:512,L:L:m:N:L,A:256:8,16:0,32 -gpgpu_shmem_size 65536 -gpgpu_shmem_sizeDefault 65536 -gpgpu_shmem_per_block 65536 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index c4818d10f..18f55641d 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -115,7 +115,7 @@ -gpgpu_adaptive_cache_config 1 # Volta unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:256:8,16:0,32 -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 2010aa698..11dbcaf1c 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -109,7 +109,7 @@ -gpgpu_adaptive_cache_config 1 # 
Ampere unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:256:8,16:0,32 -gpgpu_shmem_size 102400 -gpgpu_shmem_sizeDefault 102400 -gpgpu_shmem_per_block 102400 From 1ee03f0116511ac3c2d6ac7688d916191f4f0a6b Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Thu, 25 Mar 2021 12:54:14 -0400 Subject: [PATCH 009/154] change the L1 cache policy based on recent ubench --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 8be9a73d2..6189dca0f 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -101,7 +101,7 @@ # ** Optional parameter - Required when mshr_type==Texture Fifo -gpgpu_adaptive_cache_config 0 -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:512,L:L:m:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:512,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_shmem_size 65536 -gpgpu_shmem_sizeDefault 65536 -gpgpu_shmem_per_block 65536 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 18f55641d..bc5677cf3 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -115,7 +115,7 @@ -gpgpu_adaptive_cache_config 1 # Volta unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 11dbcaf1c..f5418ad8e 100644 --- 
a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -109,7 +109,7 @@ -gpgpu_adaptive_cache_config 1 # Ampere unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_shmem_size 102400 -gpgpu_shmem_sizeDefault 102400 -gpgpu_shmem_per_block 102400 From 553346445486367799d4d67bf3537e54b7c83859 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Sun, 9 May 2021 13:11:39 -0400 Subject: [PATCH 010/154] parition CU allocation, add prints --- src/abstract_hardware_model.h | 12 +++++++++++- src/gpgpu-sim/shader.cc | 25 +++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index c012de0d8..636052ad7 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1315,7 +1315,17 @@ class register_set { } return false; } - + unsigned get_ready_reg_id() { + // for sub core model we need to figure which reg_id has the ready warp + // this function should only be called if has_ready() was true + assert(has_ready()); + for (unsigned i = 0; i < regs.size(); i++) { + if (not regs[i]->empty()) { + return i; + } + } + abort(); + } void move_in(warp_inst_t *&src) { warp_inst_t **free = get_free(); move_warp(*free, src); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c6e7b8f67..40120ec9c 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3974,7 +3974,18 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { for (unsigned j = 0; j < inp.m_cu_sets.size(); j++) { std::vector &cu_set = m_cus[inp.m_cu_sets[j]]; bool allocated = false; - for (unsigned k = 0; k < cu_set.size(); k++) { + unsigned cuLowerBound = 0; + unsigned cuUpperBound = cu_set.size(); + if(sub_core_model) { + // Sub core model only allocates on the subset of CUs assigned to the scheduler that issued + 
unsigned reg_id = (*inp.m_in[i]).get_ready_reg_id(); + assert(cu_set.size() % m_num_warp_scheds == 0); + unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; + cuLowerBound = reg_id * cusPerSched; + cuUpperBound = cuLowerBound + cusPerSched; + assert(0 <= cuLowerBound && cuUpperBound <= cu_set.size()); + } + for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); @@ -3984,7 +3995,7 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } if (allocated) break; // cu has been allocated, no need to search more. } - break; // can only service a single input, if it failed it will fail for + //break; // can only service a single input, if it failed it will fail for // others. } } @@ -4098,6 +4109,16 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, void opndcoll_rfu_t::collector_unit_t::dispatch() { assert(m_not_ready.none()); // move_warp(*m_output_register,m_warp); + // Print out which OC dispatched which warp sched id to which exec pipeline + std::cout << "Dispatched from OC: " + << this->get_id() + << "\t Warp_id: " + << m_warp->get_uid() + << "\t Sched_id: " + << m_warp->get_schd_id() + << "\tto execution register: " + << m_output_register->get_name() + << std::endl; m_output_register->move_in(m_warp); m_free = true; m_output_register = NULL; From 645a0eaa6b431c5d4279330c72905ac6b6e7abb2 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Sun, 9 May 2021 13:23:12 -0400 Subject: [PATCH 011/154] minor fixes --- src/abstract_hardware_model.h | 1 + src/gpgpu-sim/shader.cc | 2 +- src/gpgpu-sim/shader.h | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 636052ad7..4d2bb4c4b 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1291,6 +1291,7 @@ class register_set { } m_name = name; } + const char * 
get_name() {return m_name;} bool has_free() { for (unsigned i = 0; i < regs.size(); i++) { if (regs[i]->empty()) { diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 40120ec9c..372bc128a 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3867,7 +3867,7 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { assert((m_bank_warp_shift == 5) || (m_warp_size != 32)); sub_core_model = shader->get_config()->sub_core_model; - m_num_warp_sceds = shader->get_config()->gpgpu_num_sched_per_core; + m_num_warp_scheds = shader->get_config()->gpgpu_num_sched_per_core; if (sub_core_model) assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); m_num_banks_per_sched = diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 6481790bc..05c0e4c93 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -947,7 +947,7 @@ class opndcoll_rfu_t { // operand collector based register file unit arbiter_t m_arbiter; unsigned m_num_banks_per_sched; - unsigned m_num_warp_sceds; + unsigned m_num_warp_scheds; bool sub_core_model; // unsigned m_num_ports; From 46423a22b7c11663e4849dbd3bb77f2d530f6907 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Sun, 9 May 2021 14:07:05 -0400 Subject: [PATCH 012/154] useful print statement --- src/gpgpu-sim/shader.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 372bc128a..895a2ef84 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3983,10 +3983,12 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; cuLowerBound = reg_id * cusPerSched; cuUpperBound = cuLowerBound + cusPerSched; + std::cout << "reg_id: " << reg_id << " cusPerSched: " << cusPerSched << " lowerBound: " << cuLowerBound << std::endl; assert(0 <= cuLowerBound && cuUpperBound <= cu_set.size()); } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if 
(cu_set[k].is_free()) { + std::cout << "Allocated on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); From b67288046af824a88f8bb94541ded14cc711ef35 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Sun, 9 May 2021 14:42:29 -0400 Subject: [PATCH 013/154] validated collector unit partitioning based on scheduler --- src/abstract_hardware_model.h | 16 ++++++++++++++-- src/gpgpu-sim/shader.cc | 8 +++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 4d2bb4c4b..ba32358b7 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1320,12 +1320,24 @@ class register_set { // for sub core model we need to figure which reg_id has the ready warp // this function should only be called if has_ready() was true assert(has_ready()); + warp_inst_t **ready; + ready = NULL; + unsigned reg_id; for (unsigned i = 0; i < regs.size(); i++) { if (not regs[i]->empty()) { - return i; + if (ready and (*ready)->get_uid() < regs[i]->get_uid()) { + // ready is oldest + } else { + ready = ®s[i]; + reg_id = i; + } } } - abort(); + return reg_id; + } + unsigned get_schd_id(unsigned reg_id) { + assert(not regs[reg_id]->empty()); + return regs[reg_id]->get_schd_id(); } void move_in(warp_inst_t *&src) { warp_inst_t **free = get_free(); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 895a2ef84..5c27b9b5e 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3976,19 +3976,21 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { bool allocated = false; unsigned cuLowerBound = 0; unsigned cuUpperBound = cu_set.size(); + unsigned schd_id; if(sub_core_model) { // Sub core model only allocates on the subset of CUs assigned to the scheduler that issued unsigned reg_id = (*inp.m_in[i]).get_ready_reg_id(); + schd_id = (*inp.m_in[i]).get_schd_id(reg_id); 
assert(cu_set.size() % m_num_warp_scheds == 0); unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; - cuLowerBound = reg_id * cusPerSched; + cuLowerBound = schd_id * cusPerSched; cuUpperBound = cuLowerBound + cusPerSched; - std::cout << "reg_id: " << reg_id << " cusPerSched: " << cusPerSched << " lowerBound: " << cuLowerBound << std::endl; + std::cout << "reg_id: " << reg_id << " schd_id: " << schd_id << " cusPerSched: " << cusPerSched << " lowerBound: " << cuLowerBound << std::endl; assert(0 <= cuLowerBound && cuUpperBound <= cu_set.size()); } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { - std::cout << "Allocated on cu: " << k << std::endl; + std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); From fa76ab438b0b8c2d2e8abf5f395c7a98a3d5fd9b Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 15:05:06 -0400 Subject: [PATCH 014/154] sub core model dispatches only to assigned exec pipelines --- src/abstract_hardware_model.h | 11 +++++++++++ src/gpgpu-sim/shader.cc | 17 ++++++++++------- src/gpgpu-sim/shader.h | 10 ++++++---- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index ba32358b7..d70c3ebc3 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1346,6 +1346,17 @@ class register_set { // void copy_in( warp_inst_t* src ){ // src->copy_contents_to(*get_free()); //} + void move_in(bool sub_core_model, unsigned reg_id, warp_inst_t *&src) { + warp_inst_t **free; + if (!sub_core_model) { + free = get_free(); + } else { + assert(reg_id < regs.size()); + free = get_free(sub_core_model, reg_id); + } + move_warp(*free, src); + } + void move_out_to(warp_inst_t *&dest) { warp_inst_t **ready = get_ready(); move_warp(dest, *ready); diff --git a/src/gpgpu-sim/shader.cc 
b/src/gpgpu-sim/shader.cc index 5c27b9b5e..ec1073334 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3939,7 +3939,7 @@ bool opndcoll_rfu_t::writeback(warp_inst_t &inst) { void opndcoll_rfu_t::dispatch_ready_cu() { for (unsigned p = 0; p < m_dispatch_units.size(); ++p) { dispatch_unit_t &du = m_dispatch_units[p]; - collector_unit_t *cu = du.find_ready(); + collector_unit_t *cu = du.find_ready(sub_core_model, p); if (cu) { for (unsigned i = 0; i < (cu->get_num_operands() - cu->get_num_regs()); i++) { @@ -3961,7 +3961,9 @@ void opndcoll_rfu_t::dispatch_ready_cu() { m_shader->get_config()->warp_size); // cu->get_active_count()); } } - cu->dispatch(); + unsigned cusPerSched = du->get_num_collectors() / m_num_warp_scheds; + unsigned reg_id = p / cusPerSched; + cu->dispatch(sub_core_model, reg_id); } } } @@ -3985,7 +3987,6 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; cuLowerBound = schd_id * cusPerSched; cuUpperBound = cuLowerBound + cusPerSched; - std::cout << "reg_id: " << reg_id << " schd_id: " << schd_id << " cusPerSched: " << cusPerSched << " lowerBound: " << cuLowerBound << std::endl; assert(0 <= cuLowerBound && cuUpperBound <= cu_set.size()); } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { @@ -4046,8 +4047,8 @@ void opndcoll_rfu_t::allocate_reads() { } } -bool opndcoll_rfu_t::collector_unit_t::ready() const { - return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(); +bool opndcoll_rfu_t::collector_unit_t::ready(bool sub_core_model, unsigned reg_id) const { + return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(sub_core_model, reg_id); } void opndcoll_rfu_t::collector_unit_t::dump( @@ -4110,7 +4111,7 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, return false; } -void opndcoll_rfu_t::collector_unit_t::dispatch() { +void opndcoll_rfu_t::collector_unit_t::dispatch(bool sub_core_model, unsigned 
reg_id) { assert(m_not_ready.none()); // move_warp(*m_output_register,m_warp); // Print out which OC dispatched which warp sched id to which exec pipeline @@ -4122,8 +4123,10 @@ void opndcoll_rfu_t::collector_unit_t::dispatch() { << m_warp->get_schd_id() << "\tto execution register: " << m_output_register->get_name() + << "\treg id: " + << reg_id << std::endl; - m_output_register->move_in(m_warp); + m_output_register->move_in(sub_core_model, reg_id, m_warp); m_free = true; m_output_register = NULL; for (unsigned i = 0; i < MAX_REG_OPERANDS * 2; i++) m_src_op[i].reset(); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 05c0e4c93..74bf32093 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -867,7 +867,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_bank_warp_shift = 0; } // accessors - bool ready() const; + bool ready(bool sub_core_modle, unsigned reg_id) const; const op_t *get_operands() const { return m_src_op; } void dump(FILE *fp, const shader_core_ctx *shader) const; @@ -888,7 +888,7 @@ class opndcoll_rfu_t { // operand collector based register file unit void collect_operand(unsigned op) { m_not_ready.reset(op); } unsigned get_num_operands() const { return m_warp->get_num_operands(); } unsigned get_num_regs() const { return m_warp->get_num_regs(); } - void dispatch(); + void dispatch(bool sub_core_model, unsigned reg_id); bool is_free() { return m_free; } private: @@ -917,10 +917,10 @@ class opndcoll_rfu_t { // operand collector based register file unit m_next_cu = 0; } - collector_unit_t *find_ready() { + collector_unit_t *find_ready(bool sub_core_model, unsigned reg_id) { for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; - if ((*m_collector_units)[c].ready()) { + if ((*m_collector_units)[c].ready(sub_core_model, reg_id)) { m_last_cu = c; return &((*m_collector_units)[c]); } @@ -928,6 +928,8 @@ class opndcoll_rfu_t { // operand collector based 
register file unit return NULL; } + unsigned get_num_collectors(){return m_num_collectors;} + private: unsigned m_num_collectors; std::vector *m_collector_units; From c905726ae9921e6ba67df77fd4ba5bb87215d69d Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Mon, 10 May 2021 15:08:28 -0400 Subject: [PATCH 015/154] minor fix accessing du --- src/gpgpu-sim/shader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index ec1073334..c3b8d3949 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3961,7 +3961,7 @@ void opndcoll_rfu_t::dispatch_ready_cu() { m_shader->get_config()->warp_size); // cu->get_active_count()); } } - unsigned cusPerSched = du->get_num_collectors() / m_num_warp_scheds; + unsigned cusPerSched = du.get_num_collectors() / m_num_warp_scheds; unsigned reg_id = p / cusPerSched; cu->dispatch(sub_core_model, reg_id); } From a72b84e0f6e90754728d0309aac5dca1e00b7874 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 18:50:34 -0400 Subject: [PATCH 016/154] fix find_ready reg_id --- src/gpgpu-sim/shader.cc | 2 +- src/gpgpu-sim/shader.h | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c3b8d3949..d9d441149 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3962,7 +3962,7 @@ void opndcoll_rfu_t::dispatch_ready_cu() { } } unsigned cusPerSched = du.get_num_collectors() / m_num_warp_scheds; - unsigned reg_id = p / cusPerSched; + unsigned reg_id = cu->get_id() / cusPerSched; cu->dispatch(sub_core_model, reg_id); } } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 74bf32093..9b14bfdc5 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -917,9 +917,10 @@ class opndcoll_rfu_t { // operand collector based register file unit m_next_cu = 0; } - collector_unit_t *find_ready(bool sub_core_model, unsigned reg_id) { + collector_unit_t 
*find_ready(bool sub_core_model) { for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; + unsigned reg_id = c / m_num_collectors; if ((*m_collector_units)[c].ready(sub_core_model, reg_id)) { m_last_cu = c; return &((*m_collector_units)[c]); @@ -929,7 +930,7 @@ class opndcoll_rfu_t { // operand collector based register file unit } unsigned get_num_collectors(){return m_num_collectors;} - + private: unsigned m_num_collectors; std::vector *m_collector_units; From 6ad5bad1d992e1add154957ac4903ce17007b912 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Mon, 10 May 2021 19:34:48 -0400 Subject: [PATCH 017/154] dont need du id --- src/gpgpu-sim/shader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index d9d441149..943e38c7d 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3939,7 +3939,7 @@ bool opndcoll_rfu_t::writeback(warp_inst_t &inst) { void opndcoll_rfu_t::dispatch_ready_cu() { for (unsigned p = 0; p < m_dispatch_units.size(); ++p) { dispatch_unit_t &du = m_dispatch_units[p]; - collector_unit_t *cu = du.find_ready(sub_core_model, p); + collector_unit_t *cu = du.find_ready(sub_core_model); if (cu) { for (unsigned i = 0; i < (cu->get_num_operands() - cu->get_num_regs()); i++) { From 92192368f2545cd6fc1004047af8b57762637dbf Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 19:40:46 -0400 Subject: [PATCH 018/154] remove prints --- src/gpgpu-sim/shader.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 943e38c7d..928e1083a 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3991,7 +3991,7 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { - std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; + 
//std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); @@ -4113,9 +4113,8 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, void opndcoll_rfu_t::collector_unit_t::dispatch(bool sub_core_model, unsigned reg_id) { assert(m_not_ready.none()); - // move_warp(*m_output_register,m_warp); // Print out which OC dispatched which warp sched id to which exec pipeline - std::cout << "Dispatched from OC: " + /* std::cout << "Dispatched from OC: " << this->get_id() << "\t Warp_id: " << m_warp->get_uid() @@ -4125,7 +4124,7 @@ void opndcoll_rfu_t::collector_unit_t::dispatch(bool sub_core_model, unsigned re << m_output_register->get_name() << "\treg id: " << reg_id - << std::endl; + << std::endl; */ m_output_register->move_in(sub_core_model, reg_id, m_warp); m_free = true; m_output_register = NULL; From 52a890cff520ea48d6bfa46ff7b85b5d5e06d1be Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 21:38:04 -0400 Subject: [PATCH 019/154] need at least 1 cu per sched for sub_core model, fix find_ready() reg_id --- src/gpgpu-sim/shader.cc | 9 ++++++--- src/gpgpu-sim/shader.h | 7 ++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 928e1083a..c1bc495fc 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3961,8 +3961,11 @@ void opndcoll_rfu_t::dispatch_ready_cu() { m_shader->get_config()->warp_size); // cu->get_active_count()); } } - unsigned cusPerSched = du.get_num_collectors() / m_num_warp_scheds; - unsigned reg_id = cu->get_id() / cusPerSched; + unsigned reg_id; + if (sub_core_model) { + unsigned cusPerSched = du.get_num_collectors() / m_num_warp_scheds; + reg_id = cu->get_id() / cusPerSched; + } cu->dispatch(sub_core_model, reg_id); } } @@ -3983,7 +3986,7 @@ void opndcoll_rfu_t::allocate_cu(unsigned 
port_num) { // Sub core model only allocates on the subset of CUs assigned to the scheduler that issued unsigned reg_id = (*inp.m_in[i]).get_ready_reg_id(); schd_id = (*inp.m_in[i]).get_schd_id(reg_id); - assert(cu_set.size() % m_num_warp_scheds == 0); + assert(cu_set.size() % m_num_warp_scheds == 0 && cu_set.size() >= m_num_warp_scheds); unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; cuLowerBound = schd_id * cusPerSched; cuUpperBound = cuLowerBound + cusPerSched; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 9b14bfdc5..0b96ec0c8 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -920,7 +920,12 @@ class opndcoll_rfu_t { // operand collector based register file unit collector_unit_t *find_ready(bool sub_core_model) { for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; - unsigned reg_id = c / m_num_collectors; + unsigned reg_id; + if (sub_core_model) { + assert (m_num_collectors >= m_num_warp_scheds); + unsigned cusPerSched = m_num_collectors / m_num_warp_scheds; + reg_id = c / cusPerSched; + } if ((*m_collector_units)[c].ready(sub_core_model, reg_id)) { m_last_cu = c; return &((*m_collector_units)[c]); From 2db9120218c894c7d90ef833477c3e0ca5425213 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 22:15:23 -0400 Subject: [PATCH 020/154] move reg_id calc to cu object init --- src/gpgpu-sim/shader.cc | 19 +++++++++++++------ src/gpgpu-sim/shader.h | 15 ++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c1bc495fc..72476161f 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3868,14 +3868,21 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { sub_core_model = shader->get_config()->sub_core_model; m_num_warp_scheds = shader->get_config()->gpgpu_num_sched_per_core; - if (sub_core_model) + unsigned reg_id; + if (sub_core_model) { 
assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); + assert(m_num_warp_scheds >= m_cu.size() && m_cu.size() % m_num_warp_scheds == 0); + } m_num_banks_per_sched = num_banks / shader->get_config()->gpgpu_num_sched_per_core; for (unsigned j = 0; j < m_cu.size(); j++) { + if (sub_core_model) { + unsigned cusPerSched = m_cu.size() / m_num_warp_scheds; + reg_id = j / cusPerSched; + } m_cu[j]->init(j, num_banks, m_bank_warp_shift, shader->get_config(), this, - sub_core_model, m_num_banks_per_sched); + sub_core_model, reg_id, m_num_banks_per_sched); } m_initialized = true; } @@ -3962,10 +3969,8 @@ void opndcoll_rfu_t::dispatch_ready_cu() { } } unsigned reg_id; - if (sub_core_model) { - unsigned cusPerSched = du.get_num_collectors() / m_num_warp_scheds; - reg_id = cu->get_id() / cusPerSched; - } + if (sub_core_model) + reg_id = cu->get_reg_id(); cu->dispatch(sub_core_model, reg_id); } } @@ -4074,6 +4079,7 @@ void opndcoll_rfu_t::collector_unit_t::init(unsigned n, unsigned num_banks, const core_config *config, opndcoll_rfu_t *rfu, bool sub_core_model, + unsigned reg_id, unsigned banks_per_sched) { m_rfu = rfu; m_cuid = n; @@ -4082,6 +4088,7 @@ void opndcoll_rfu_t::collector_unit_t::init(unsigned n, unsigned num_banks, m_warp = new warp_inst_t(config); m_bank_warp_shift = log2_warp_size; m_sub_core_model = sub_core_model; + m_reg_id = reg_id; m_num_banks_per_sched = banks_per_sched; } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 0b96ec0c8..a5a8166e7 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -867,7 +867,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_bank_warp_shift = 0; } // accessors - bool ready(bool sub_core_modle, unsigned reg_id) const; + bool ready(bool sub_core_model, unsigned reg_id) const; const op_t *get_operands() const { return m_src_op; } void dump(FILE *fp, const shader_core_ctx *shader) const; @@ -878,11 +878,12 @@ class opndcoll_rfu_t { // operand collector based 
register file unit } unsigned get_sp_op() const { return m_warp->sp_op; } unsigned get_id() const { return m_cuid; } // returns CU hw id + unsigned get_reg_id() const { return m_reg_id; } // modifiers void init(unsigned n, unsigned num_banks, unsigned log2_warp_size, const core_config *config, opndcoll_rfu_t *rfu, - bool m_sub_core_model, unsigned num_banks_per_sched); + bool m_sub_core_model, unsigned reg_id, unsigned num_banks_per_sched); bool allocate(register_set *pipeline_reg, register_set *output_reg); void collect_operand(unsigned op) { m_not_ready.reset(op); } @@ -906,6 +907,7 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_banks_per_sched; bool m_sub_core_model; + unsigned m_reg_id; // if sub_core_model enabled, limit regs this cu can r/w }; class dispatch_unit_t { @@ -921,11 +923,8 @@ class opndcoll_rfu_t { // operand collector based register file unit for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; unsigned reg_id; - if (sub_core_model) { - assert (m_num_collectors >= m_num_warp_scheds); - unsigned cusPerSched = m_num_collectors / m_num_warp_scheds; - reg_id = c / cusPerSched; - } + if (sub_core_model) + reg_id = (*m_collector_units)[c].get_reg_id(); if ((*m_collector_units)[c].ready(sub_core_model, reg_id)) { m_last_cu = c; return &((*m_collector_units)[c]); @@ -934,8 +933,6 @@ class opndcoll_rfu_t { // operand collector based register file unit return NULL; } - unsigned get_num_collectors(){return m_num_collectors;} - private: unsigned m_num_collectors; std::vector *m_collector_units; From 4825a1dad0938a40c8feb01e554ca8f5fdc6c4c5 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 22:26:47 -0400 Subject: [PATCH 021/154] fix assert --- src/gpgpu-sim/shader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 72476161f..acd41d868 100644 --- a/src/gpgpu-sim/shader.cc +++ 
b/src/gpgpu-sim/shader.cc @@ -3871,7 +3871,7 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { unsigned reg_id; if (sub_core_model) { assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); - assert(m_num_warp_scheds >= m_cu.size() && m_cu.size() % m_num_warp_scheds == 0); + assert(m_num_warp_scheds <= m_cu.size() && m_cu.size() % m_num_warp_scheds == 0); } m_num_banks_per_sched = num_banks / shader->get_config()->gpgpu_num_sched_per_core; From e2b410dd117b11098e6bb88be36293afbeb5c444 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 10 May 2021 22:45:02 -0400 Subject: [PATCH 022/154] clean up redundant method args --- src/gpgpu-sim/shader.cc | 13 +++++-------- src/gpgpu-sim/shader.h | 4 ++-- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index acd41d868..e3a3e9c11 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3968,10 +3968,7 @@ void opndcoll_rfu_t::dispatch_ready_cu() { m_shader->get_config()->warp_size); // cu->get_active_count()); } } - unsigned reg_id; - if (sub_core_model) - reg_id = cu->get_reg_id(); - cu->dispatch(sub_core_model, reg_id); + cu->dispatch(); } } } @@ -4055,8 +4052,8 @@ void opndcoll_rfu_t::allocate_reads() { } } -bool opndcoll_rfu_t::collector_unit_t::ready(bool sub_core_model, unsigned reg_id) const { - return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(sub_core_model, reg_id); +bool opndcoll_rfu_t::collector_unit_t::ready() const { + return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(m_sub_core_model, m_reg_id); } void opndcoll_rfu_t::collector_unit_t::dump( @@ -4121,7 +4118,7 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, return false; } -void opndcoll_rfu_t::collector_unit_t::dispatch(bool sub_core_model, unsigned reg_id) { +void opndcoll_rfu_t::collector_unit_t::dispatch() { assert(m_not_ready.none()); // Print out which OC 
dispatched which warp sched id to which exec pipeline /* std::cout << "Dispatched from OC: " @@ -4135,7 +4132,7 @@ void opndcoll_rfu_t::collector_unit_t::dispatch(bool sub_core_model, unsigned re << "\treg id: " << reg_id << std::endl; */ - m_output_register->move_in(sub_core_model, reg_id, m_warp); + m_output_register->move_in(m_sub_core_model, m_reg_id, m_warp); m_free = true; m_output_register = NULL; for (unsigned i = 0; i < MAX_REG_OPERANDS * 2; i++) m_src_op[i].reset(); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index a5a8166e7..00e7deb05 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -867,7 +867,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_bank_warp_shift = 0; } // accessors - bool ready(bool sub_core_model, unsigned reg_id) const; + bool ready() const; const op_t *get_operands() const { return m_src_op; } void dump(FILE *fp, const shader_core_ctx *shader) const; @@ -889,7 +889,7 @@ class opndcoll_rfu_t { // operand collector based register file unit void collect_operand(unsigned op) { m_not_ready.reset(op); } unsigned get_num_operands() const { return m_warp->get_num_operands(); } unsigned get_num_regs() const { return m_warp->get_num_regs(); } - void dispatch(bool sub_core_model, unsigned reg_id); + void dispatch(); bool is_free() { return m_free; } private: From 9c0156bd732fe370d5022ca036fff515fcd9d2d4 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Mon, 10 May 2021 22:58:05 -0400 Subject: [PATCH 023/154] more cleanup --- src/gpgpu-sim/shader.cc | 4 ++-- src/gpgpu-sim/shader.h | 5 +---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index e3a3e9c11..9eab7fcad 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3996,7 +3996,7 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { - //std::cout << "Allocated 
schd_id: " << schd_id << " on cu: " << k << std::endl; + // std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); @@ -4130,7 +4130,7 @@ void opndcoll_rfu_t::collector_unit_t::dispatch() { << "\tto execution register: " << m_output_register->get_name() << "\treg id: " - << reg_id + << this->get_reg_id() << std::endl; */ m_output_register->move_in(m_sub_core_model, m_reg_id, m_warp); m_free = true; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 00e7deb05..7655cb9e6 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -922,10 +922,7 @@ class opndcoll_rfu_t { // operand collector based register file unit collector_unit_t *find_ready(bool sub_core_model) { for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; - unsigned reg_id; - if (sub_core_model) - reg_id = (*m_collector_units)[c].get_reg_id(); - if ((*m_collector_units)[c].ready(sub_core_model, reg_id)) { + if ((*m_collector_units)[c].ready()) { m_last_cu = c; return &((*m_collector_units)[c]); } From 28c3c94e4e76f5c2a9fffb557587c6be3b541ccf Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Mon, 10 May 2021 23:02:17 -0400 Subject: [PATCH 024/154] cleanup find_ready --- src/gpgpu-sim/shader.cc | 2 +- src/gpgpu-sim/shader.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 9eab7fcad..db24d8c52 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3946,7 +3946,7 @@ bool opndcoll_rfu_t::writeback(warp_inst_t &inst) { void opndcoll_rfu_t::dispatch_ready_cu() { for (unsigned p = 0; p < m_dispatch_units.size(); ++p) { dispatch_unit_t &du = m_dispatch_units[p]; - collector_unit_t *cu = du.find_ready(sub_core_model); + collector_unit_t *cu = du.find_ready(); if (cu) { for (unsigned i = 0; i < 
(cu->get_num_operands() - cu->get_num_regs()); i++) { diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 7655cb9e6..75734e476 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -919,7 +919,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_next_cu = 0; } - collector_unit_t *find_ready(bool sub_core_model) { + collector_unit_t *find_ready() { for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + 1) % m_num_collectors; if ((*m_collector_units)[c].ready()) { From 28d056519c7f1771557f90d5b0b295b7f75c1a2d Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Tue, 11 May 2021 18:13:37 -0400 Subject: [PATCH 025/154] partition issue() in the shader execute stage --- src/abstract_hardware_model.h | 16 ++++++++ src/gpgpu-sim/shader.cc | 72 +++++++++++++++++++---------------- src/gpgpu-sim/shader.h | 26 +++++++++---- 3 files changed, 74 insertions(+), 40 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index d70c3ebc3..90ae44896 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1316,6 +1316,12 @@ class register_set { } return false; } + bool has_ready(bool sub_core_model, unsigned reg_id) { + if (!sub_core_model) return has_ready(); + assert(reg_id < regs.size()); + return (not regs[reg_id]->empty()) + } + unsigned get_ready_reg_id() { // for sub core model we need to figure which reg_id has the ready warp // this function should only be called if has_ready() was true @@ -1376,6 +1382,16 @@ class register_set { } return ready; } + warp_inst_t **get_ready(bool sub_core_model, unsigned reg_id) { + if (!sub_core_model) + return get_ready(); + warp_inst_t **ready; + ready = NULL; + assert(reg_id < regs.size()); + if (not regs[reg_id]->empty) + ready = ®s[reg_id]; + return ready; + } void print(FILE *fp) const { fprintf(fp, "%s : @%p\n", m_name, this); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 
e3a3e9c11..ca421deba 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -377,41 +377,41 @@ void shader_core_ctx::create_exec_pipeline() { // m_fu = new simd_function_unit*[m_num_function_units]; - for (int k = 0; k < m_config->gpgpu_num_sp_units; k++) { - m_fu.push_back(new sp_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_sp_units; k++) { + m_fu.push_back(new sp_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_SP); m_issue_port.push_back(OC_EX_SP); } - for (int k = 0; k < m_config->gpgpu_num_dp_units; k++) { - m_fu.push_back(new dp_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_dp_units; k++) { + m_fu.push_back(new dp_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_DP); m_issue_port.push_back(OC_EX_DP); } - for (int k = 0; k < m_config->gpgpu_num_int_units; k++) { - m_fu.push_back(new int_unit(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_int_units; k++) { + m_fu.push_back(new int_unit(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_INT); m_issue_port.push_back(OC_EX_INT); } - for (int k = 0; k < m_config->gpgpu_num_sfu_units; k++) { - m_fu.push_back(new sfu(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_sfu_units; k++) { + m_fu.push_back(new sfu(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_SFU); m_issue_port.push_back(OC_EX_SFU); } - for (int k = 0; k < m_config->gpgpu_num_tensor_core_units; k++) { - m_fu.push_back(new tensor_core(&m_pipeline_reg[EX_WB], m_config, this)); + for (unsigned k = 0; k < m_config->gpgpu_num_tensor_core_units; k++) { + m_fu.push_back(new tensor_core(&m_pipeline_reg[EX_WB], m_config, this, k)); m_dispatch_port.push_back(ID_OC_TENSOR_CORE); m_issue_port.push_back(OC_EX_TENSOR_CORE); } - for (int j = 0; j < 
m_config->m_specialized_unit.size(); j++) { + for (unsigned j = 0; j < m_config->m_specialized_unit.size(); j++) { for (unsigned k = 0; k < m_config->m_specialized_unit[j].num_units; k++) { m_fu.push_back(new specialized_unit( &m_pipeline_reg[EX_WB], m_config, this, SPEC_UNIT_START_ID + j, m_config->m_specialized_unit[j].name, - m_config->m_specialized_unit[j].latency)); + m_config->m_specialized_unit[j].latency, k)); m_dispatch_port.push_back(m_config->m_specialized_unit[j].ID_OC_SPEC_ID); m_issue_port.push_back(m_config->m_specialized_unit[j].OC_EX_SPEC_ID); } @@ -419,7 +419,7 @@ void shader_core_ctx::create_exec_pipeline() { m_ldst_unit = new ldst_unit(m_icnt, m_mem_fetch_allocator, this, &m_operand_collector, m_scoreboard, m_config, - m_memory_config, m_stats, m_sid, m_tpc); + m_memory_config, m_stats, m_sid, m_tpc, static_cast(0)); m_fu.push_back(m_ldst_unit); m_dispatch_port.push_back(ID_OC_MEM); m_issue_port.push_back(OC_EX_MEM); @@ -1669,8 +1669,13 @@ void shader_core_ctx::execute() { m_fu[n]->active_lanes_in_pipeline(); unsigned issue_port = m_issue_port[n]; register_set &issue_inst = m_pipeline_reg[issue_port]; - warp_inst_t **ready_reg = issue_inst.get_ready(); - if (issue_inst.has_ready() && m_fu[n]->can_issue(**ready_reg)) { + unsigned reg_id; + bool partition_issue = m_config->sub_core_model && m_fu[n]->is_issue_partitioned(); + if (m_config->sub_core_model) { + reg_id = m_fu[n]->get_issue_reg_id(); + } + warp_inst_t **ready_reg = issue_inst.get_ready(partition_issue, reg_id); + if (issue_inst.has_ready(partition_issue, reg_id) && m_fu[n]->can_issue(**ready_reg)) { bool schedule_wb_now = !m_fu[n]->stallable(); int resbus = -1; if (schedule_wb_now && @@ -2113,16 +2118,17 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { } sfu::sfu(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core) { + shader_core_ctx *core, unsigned 
issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core, + issue_reg_id) { m_name = "SFU"; } tensor_core::tensor_core(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) + shader_core_ctx *core, unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, config->max_tensor_core_latency, - core) { + core, issue_reg_id) { m_name = "TENSOR_CORE"; } @@ -2208,29 +2214,29 @@ void tensor_core::active_lanes_in_pipeline() { } sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_sp_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_sp_latency, core, issue_reg_id) { m_name = "SP "; } specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency) - : pipelined_simd_unit(result_port, config, latency, core) { + char *unit_name, unsigned latency, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { m_name = unit_name; m_supported_op = supported_op; } dp_unit::dp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_dp_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_dp_latency, core, issue_reg_id) { m_name = "DP "; } int_unit::int_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core) - : pipelined_simd_unit(result_port, config, config->max_int_latency, core) { + shader_core_ctx *core, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, config->max_int_latency, core, issue_reg_id) { m_name = "INT "; } @@ -2269,7 +2275,8 @@ void int_unit ::issue(register_set 
&source_reg) { pipelined_simd_unit::pipelined_simd_unit(register_set *result_port, const shader_core_config *config, unsigned max_latency, - shader_core_ctx *core) + shader_core_ctx *core, + unsigned issue_reg_id) : simd_function_unit(config) { m_result_port = result_port; m_pipeline_depth = max_latency; @@ -2277,6 +2284,7 @@ pipelined_simd_unit::pipelined_simd_unit(register_set *result_port, for (unsigned i = 0; i < m_pipeline_depth; i++) m_pipeline_reg[i] = new warp_inst_t(config); m_core = core; + m_issue_reg_id = issue_reg_id; active_insts_in_pipeline = 0; } @@ -2359,8 +2367,8 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, - unsigned sid, unsigned tpc) - : pipelined_simd_unit(NULL, config, config->smem_latency, core), + unsigned sid, unsigned tpc, unsigned issue_reg_id) + : pipelined_simd_unit(NULL, config, config->smem_latency, core, issue_reg_id), m_next_wb(config) { assert(config->smem_latency > 1); init(icnt, mf_allocator, core, operand_collector, scoreboard, config, @@ -2387,8 +2395,8 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, - unsigned sid, unsigned tpc, l1_cache *new_l1d_cache) - : pipelined_simd_unit(NULL, config, 3, core), + unsigned sid, unsigned tpc, l1_cache *new_l1d_cache, unsigned issue_reg_id) + : pipelined_simd_unit(NULL, config, 3, core, issue_reg_id), m_L1D(new_l1d_cache), m_next_wb(config) { init(icnt, mf_allocator, core, operand_collector, scoreboard, config, diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 00e7deb05..ba37b0cee 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1075,7 +1075,7 @@ class pipelined_simd_unit : public simd_function_unit { 
public: pipelined_simd_unit(register_set *result_port, const shader_core_config *config, unsigned max_latency, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); // modifiers virtual void cycle(); @@ -1096,6 +1096,7 @@ class pipelined_simd_unit : public simd_function_unit { virtual bool can_issue(const warp_inst_t &inst) const { return simd_function_unit::can_issue(inst); } + unsigned get_issue_reg_id() { return m_issue_reg_id; } virtual void print(FILE *fp) const { simd_function_unit::print(fp); for (int s = m_pipeline_depth - 1; s >= 0; s--) { @@ -1111,6 +1112,8 @@ class pipelined_simd_unit : public simd_function_unit { warp_inst_t **m_pipeline_reg; register_set *m_result_port; class shader_core_ctx *m_core; + unsigned m_issue_reg_id; // if sub_core_model is enabled we can only issue from a + // subset of operand collectors unsigned active_insts_in_pipeline; }; @@ -1118,7 +1121,7 @@ class pipelined_simd_unit : public simd_function_unit { class sfu : public pipelined_simd_unit { public: sfu(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1134,12 +1137,13 @@ class sfu : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class dp_unit : public pipelined_simd_unit { public: dp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case DP_OP: @@ -1151,12 +1155,13 @@ class dp_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class tensor_core : public pipelined_simd_unit { 
public: tensor_core(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case TENSOR_CORE_OP: @@ -1168,12 +1173,13 @@ class tensor_core : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class int_unit : public pipelined_simd_unit { public: int_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1199,12 +1205,13 @@ class int_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class sp_unit : public pipelined_simd_unit { public: sp_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core); + shader_core_ctx *core, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { switch (inst.op) { case SFU_OP: @@ -1228,13 +1235,14 @@ class sp_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); + bool is_issue_partitioned() { return true; } }; class specialized_unit : public pipelined_simd_unit { public: specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency); + char *unit_name, unsigned latency, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { if (inst.op != m_supported_op) { return false; @@ -1243,6 +1251,7 @@ class specialized_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set 
&source_reg); + bool is_issue_partitioned() { return false; } private: unsigned m_supported_op; @@ -1260,10 +1269,11 @@ class ldst_unit : public pipelined_simd_unit { shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, class shader_core_stats *stats, - unsigned sid, unsigned tpc); + unsigned sid, unsigned tpc, unsigned issue_reg_id); // modifiers virtual void issue(register_set &inst); + bool is_issue_partitioned() { return false; } virtual void cycle(); void fill(mem_fetch *mf); From ec55c68bcdf4406743efa591fcb30e4f467012a0 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Tue, 11 May 2021 19:30:09 -0400 Subject: [PATCH 026/154] minor fixes, pure virtual calls --- src/abstract_hardware_model.h | 4 ++-- src/gpgpu-sim/shader.cc | 16 ++++++++-------- src/gpgpu-sim/shader.h | 7 +++++-- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 90ae44896..6d431fc60 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1319,7 +1319,7 @@ class register_set { bool has_ready(bool sub_core_model, unsigned reg_id) { if (!sub_core_model) return has_ready(); assert(reg_id < regs.size()); - return (not regs[reg_id]->empty()) + return (not regs[reg_id]->empty()); } unsigned get_ready_reg_id() { @@ -1388,7 +1388,7 @@ class register_set { warp_inst_t **ready; ready = NULL; assert(reg_id < regs.size()); - if (not regs[reg_id]->empty) + if (not regs[reg_id]->empty()) ready = ®s[reg_id]; return ready; } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 17cf5ba26..d98f10a95 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -411,7 +411,7 @@ void shader_core_ctx::create_exec_pipeline() { m_fu.push_back(new specialized_unit( &m_pipeline_reg[EX_WB], m_config, this, SPEC_UNIT_START_ID + j, m_config->m_specialized_unit[j].name, - 
m_config->m_specialized_unit[j].latency, k)); + m_config->m_specialized_unit[j].latency)); m_dispatch_port.push_back(m_config->m_specialized_unit[j].ID_OC_SPEC_ID); m_issue_port.push_back(m_config->m_specialized_unit[j].OC_EX_SPEC_ID); } @@ -419,7 +419,7 @@ void shader_core_ctx::create_exec_pipeline() { m_ldst_unit = new ldst_unit(m_icnt, m_mem_fetch_allocator, this, &m_operand_collector, m_scoreboard, m_config, - m_memory_config, m_stats, m_sid, m_tpc, static_cast(0)); + m_memory_config, m_stats, m_sid, m_tpc); m_fu.push_back(m_ldst_unit); m_dispatch_port.push_back(ID_OC_MEM); m_issue_port.push_back(OC_EX_MEM); @@ -2222,8 +2222,8 @@ sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency, unsigned issue_reg_id) - : pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { + char *unit_name, unsigned latency) + : pipelined_simd_unit(result_port, config, latency, core, 0) { m_name = unit_name; m_supported_op = supported_op; } @@ -2367,8 +2367,8 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, - unsigned sid, unsigned tpc, unsigned issue_reg_id) - : pipelined_simd_unit(NULL, config, config->smem_latency, core, issue_reg_id), + unsigned sid, unsigned tpc) + : pipelined_simd_unit(NULL, config, config->smem_latency, core, 0), m_next_wb(config) { assert(config->smem_latency > 1); init(icnt, mf_allocator, core, operand_collector, scoreboard, config, @@ -2395,8 +2395,8 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats 
*stats, - unsigned sid, unsigned tpc, l1_cache *new_l1d_cache, unsigned issue_reg_id) - : pipelined_simd_unit(NULL, config, 3, core, issue_reg_id), + unsigned sid, unsigned tpc, l1_cache *new_l1d_cache) + : pipelined_simd_unit(NULL, config, 3, core, 0), m_L1D(new_l1d_cache), m_next_wb(config) { init(icnt, mf_allocator, core, operand_collector, scoreboard, config, diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 5c5e9a46b..62abd35ab 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1053,6 +1053,8 @@ class simd_function_unit { virtual bool can_issue(const warp_inst_t &inst) const { return m_dispatch_reg->empty() && !occupied.test(inst.latency); } + virtual bool is_issue_partitioned() = 0; + virtual unsigned get_issue_reg_id() = 0; virtual bool stallable() const = 0; virtual void print(FILE *fp) const { fprintf(fp, "%s dispatch= ", m_name.c_str()); @@ -1093,6 +1095,7 @@ class pipelined_simd_unit : public simd_function_unit { virtual bool can_issue(const warp_inst_t &inst) const { return simd_function_unit::can_issue(inst); } + virtual bool is_issue_partitioned() = 0; unsigned get_issue_reg_id() { return m_issue_reg_id; } virtual void print(FILE *fp) const { simd_function_unit::print(fp); @@ -1239,7 +1242,7 @@ class specialized_unit : public pipelined_simd_unit { public: specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency, unsigned issue_reg_id); + char *unit_name, unsigned latency); virtual bool can_issue(const warp_inst_t &inst) const { if (inst.op != m_supported_op) { return false; @@ -1266,7 +1269,7 @@ class ldst_unit : public pipelined_simd_unit { shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, class shader_core_stats *stats, - unsigned sid, unsigned tpc, unsigned issue_reg_id); + unsigned sid, unsigned tpc); // modifiers 
virtual void issue(register_set &inst); From 71455d84455f4a75bb2763ebe2fd58617a4ad843 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Tue, 11 May 2021 20:07:08 -0400 Subject: [PATCH 027/154] add prints for ex issue validation --- src/gpgpu-sim/shader.cc | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index d98f10a95..f838ba118 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1683,9 +1683,28 @@ void shader_core_ctx::execute() { assert((*ready_reg)->latency < MAX_ALU_LATENCY); m_result_bus[resbus]->set((*ready_reg)->latency); m_fu[n]->issue(issue_inst); + warp_inst_t** instr = issue_inst.get_ready(true, reg_id); + std::cout << "EX stage issued warp_id: " + << (*instr)->warp_id() + << " schd_id: " + << (*instr)->get_schd_id() + << " to pipeline: " + << m_fu[n]->get_name() + << " issue reg_id: " + << m_fu[n]->get_issue_reg_id() + << std::endl; } else if (!schedule_wb_now) { m_fu[n]->issue(issue_inst); - } else { + std::cout << "EX stage issued warp_id: " + << (*instr)->warp_id() + << " schd_id: " + << (*instr)->get_schd_id() + << " to pipeline: " + << m_fu[n]->get_name() + << " issue reg_id: " + << m_fu[n]->get_issue_reg_id() + << std::endl; + } else { // stall issue (cannot reserve result bus) } } @@ -4004,7 +4023,7 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { - // std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; + std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); @@ -4129,7 +4148,7 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, void opndcoll_rfu_t::collector_unit_t::dispatch() { assert(m_not_ready.none()); // Print out which OC dispatched 
which warp sched id to which exec pipeline - /* std::cout << "Dispatched from OC: " + std::cout << "Dispatched from OC: " << this->get_id() << "\t Warp_id: " << m_warp->get_uid() @@ -4139,7 +4158,7 @@ void opndcoll_rfu_t::collector_unit_t::dispatch() { << m_output_register->get_name() << "\treg id: " << this->get_reg_id() - << std::endl; */ + << std::endl; m_output_register->move_in(m_sub_core_model, m_reg_id, m_warp); m_free = true; m_output_register = NULL; From 640674b74b12ef4b0188b267884eda9391f4bf34 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Tue, 11 May 2021 20:25:49 -0400 Subject: [PATCH 028/154] issue function needed to be constrained --- src/abstract_hardware_model.h | 5 +++++ src/gpgpu-sim/shader.cc | 12 ++++++------ src/gpgpu-sim/shader.h | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 6d431fc60..e9da4294e 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1367,6 +1367,11 @@ class register_set { warp_inst_t **ready = get_ready(); move_warp(dest, *ready); } + void move_out_to(bool sub_core_model, unsigned reg_id, warp_inst_t *&dest) { + if (!sub_core_model) { return move_out_to(dest);} + warp_inst_t **ready = get_ready(sub_core_model, reg_id); + move_warp(dest, *ready); + } warp_inst_t **get_ready() { warp_inst_t **ready; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index f838ba118..659d1590f 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -2152,7 +2152,7 @@ tensor_core::tensor_core(register_set *result_port, } void sfu::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SFU__OP; @@ -2161,7 +2161,7 @@ void sfu::issue(register_set &source_reg) { } void tensor_core::issue(register_set &source_reg) { - 
warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = TENSOR_CORE__OP; @@ -2260,7 +2260,7 @@ int_unit::int_unit(register_set *result_port, const shader_core_config *config, } void sp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2268,7 +2268,7 @@ void sp_unit ::issue(register_set &source_reg) { } void dp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = DP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2284,7 +2284,7 @@ void specialized_unit ::issue(register_set &source_reg) { } void int_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = INTP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2330,7 +2330,7 @@ void pipelined_simd_unit::cycle() { void pipelined_simd_unit::issue(register_set &source_reg) { // move_warp(m_dispatch_reg,source_reg); - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); m_core->incexecstat((*ready_reg)); // source_reg.move_out_to(m_dispatch_reg); simd_function_unit::issue(source_reg); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 
62abd35ab..2b0c71041 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1042,7 +1042,7 @@ class simd_function_unit { // modifiers virtual void issue(register_set &source_reg) { - source_reg.move_out_to(m_dispatch_reg); + source_reg.move_out_to(m_config->sub_core_model, this->get_issue_reg_id(), m_dispatch_reg); occupied.set(m_dispatch_reg->latency); } virtual void cycle() = 0; From 9b6af844b8adc5d15bd793646c18a7b1d9593890 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Tue, 11 May 2021 20:35:49 -0400 Subject: [PATCH 029/154] fix print, move simd::issue() impl to .cc file --- src/gpgpu-sim/shader.cc | 6 ++++++ src/gpgpu-sim/shader.h | 5 +---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 659d1590f..349f95462 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1695,6 +1695,7 @@ void shader_core_ctx::execute() { << std::endl; } else if (!schedule_wb_now) { m_fu[n]->issue(issue_inst); + warp_inst_t** instr = issue_inst.get_ready(true, reg_id); std::cout << "EX stage issued warp_id: " << (*instr)->warp_id() << " schd_id: " @@ -2136,6 +2137,11 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { m_dispatch_reg = new warp_inst_t(config); } +void simd_function_unit::issue(register_set &source_reg) { + source_reg.move_out_to(m_config->sub_core_model, this->get_issue_reg_id(), m_dispatch_reg); + occupied.set(m_dispatch_reg->latency); + } + sfu::sfu(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core, diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 2b0c71041..7987427d1 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1041,10 +1041,7 @@ class simd_function_unit { ~simd_function_unit() { delete m_dispatch_reg; } // modifiers - virtual void issue(register_set &source_reg) { - 
source_reg.move_out_to(m_config->sub_core_model, this->get_issue_reg_id(), m_dispatch_reg); - occupied.set(m_dispatch_reg->latency); - } + virtual void issue(register_set &source_reg); virtual void cycle() = 0; virtual void active_lanes_in_pipeline() = 0; From 6ae23912133b158670343da08469747cefef97d1 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Wed, 12 May 2021 12:53:36 -0400 Subject: [PATCH 030/154] fix prints / segfault --- src/abstract_hardware_model.h | 1 + src/gpgpu-sim/shader.cc | 32 +++++++++----------------------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index e9da4294e..129ed69d9 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1370,6 +1370,7 @@ class register_set { void move_out_to(bool sub_core_model, unsigned reg_id, warp_inst_t *&dest) { if (!sub_core_model) { return move_out_to(dest);} warp_inst_t **ready = get_ready(sub_core_model, reg_id); + assert(ready != NULL); move_warp(dest, *ready); } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 349f95462..8816959f6 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1671,7 +1671,7 @@ void shader_core_ctx::execute() { register_set &issue_inst = m_pipeline_reg[issue_port]; unsigned reg_id; bool partition_issue = m_config->sub_core_model && m_fu[n]->is_issue_partitioned(); - if (m_config->sub_core_model) { + if (partition_issue) { reg_id = m_fu[n]->get_issue_reg_id(); } warp_inst_t **ready_reg = issue_inst.get_ready(partition_issue, reg_id); @@ -1683,28 +1683,10 @@ void shader_core_ctx::execute() { assert((*ready_reg)->latency < MAX_ALU_LATENCY); m_result_bus[resbus]->set((*ready_reg)->latency); m_fu[n]->issue(issue_inst); - warp_inst_t** instr = issue_inst.get_ready(true, reg_id); - std::cout << "EX stage issued warp_id: " - << (*instr)->warp_id() - << " schd_id: " - << (*instr)->get_schd_id() - << " to pipeline: " - << m_fu[n]->get_name() - << 
" issue reg_id: " - << m_fu[n]->get_issue_reg_id() - << std::endl; + warp_inst_t** instr = issue_inst.get_ready(partition_issue, reg_id); } else if (!schedule_wb_now) { m_fu[n]->issue(issue_inst); - warp_inst_t** instr = issue_inst.get_ready(true, reg_id); - std::cout << "EX stage issued warp_id: " - << (*instr)->warp_id() - << " schd_id: " - << (*instr)->get_schd_id() - << " to pipeline: " - << m_fu[n]->get_name() - << " issue reg_id: " - << m_fu[n]->get_issue_reg_id() - << std::endl; + warp_inst_t** instr = issue_inst.get_ready(partition_issue, reg_id); } else { // stall issue (cannot reserve result bus) } @@ -2138,7 +2120,10 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { } void simd_function_unit::issue(register_set &source_reg) { - source_reg.move_out_to(m_config->sub_core_model, this->get_issue_reg_id(), m_dispatch_reg); + bool partition_issue = m_config->sub_core_model && this->is_issue_partitioned(); + source_reg.move_out_to(partition_issue, this->get_issue_reg_id(), m_dispatch_reg); + std::cout << "EX stage issue stats:" << std::endl; + this->print(stdout); occupied.set(m_dispatch_reg->latency); } @@ -2336,7 +2321,8 @@ void pipelined_simd_unit::cycle() { void pipelined_simd_unit::issue(register_set &source_reg) { // move_warp(m_dispatch_reg,source_reg); - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + bool partition_issue = m_config->sub_core_model && this->is_issue_partitioned(); + warp_inst_t **ready_reg = source_reg.get_ready(partition_issue, m_issue_reg_id); m_core->incexecstat((*ready_reg)); // source_reg.move_out_to(m_dispatch_reg); simd_function_unit::issue(source_reg); From a450d74a66ed7c58aef66ea28f358230ac614f3d Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Wed, 12 May 2021 12:56:56 -0400 Subject: [PATCH 031/154] remove prints --- src/gpgpu-sim/shader.cc | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc 
index 8816959f6..d978e6cf4 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -2122,8 +2122,6 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { void simd_function_unit::issue(register_set &source_reg) { bool partition_issue = m_config->sub_core_model && this->is_issue_partitioned(); source_reg.move_out_to(partition_issue, this->get_issue_reg_id(), m_dispatch_reg); - std::cout << "EX stage issue stats:" << std::endl; - this->print(stdout); occupied.set(m_dispatch_reg->latency); } @@ -4015,7 +4013,6 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { } for (unsigned k = cuLowerBound; k < cuUpperBound; k++) { if (cu_set[k].is_free()) { - std::cout << "Allocated schd_id: " << schd_id << " on cu: " << k << std::endl; collector_unit_t *cu = &cu_set[k]; allocated = cu->allocate(inp.m_in[i], inp.m_out[i]); m_arbiter.add_read_requests(cu); @@ -4139,18 +4136,6 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, void opndcoll_rfu_t::collector_unit_t::dispatch() { assert(m_not_ready.none()); - // Print out which OC dispatched which warp sched id to which exec pipeline - std::cout << "Dispatched from OC: " - << this->get_id() - << "\t Warp_id: " - << m_warp->get_uid() - << "\t Sched_id: " - << m_warp->get_schd_id() - << "\tto execution register: " - << m_output_register->get_name() - << "\treg id: " - << this->get_reg_id() - << std::endl; m_output_register->move_in(m_sub_core_model, m_reg_id, m_warp); m_free = true; m_output_register = NULL; From 6a09900b34d2eaf5397fd24a5892bf09062be732 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Wed, 12 May 2021 15:36:37 -0400 Subject: [PATCH 032/154] rm unnecessary instr get --- src/gpgpu-sim/shader.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index d978e6cf4..c72ed95db 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1683,10 +1683,8 @@ void shader_core_ctx::execute() { 
assert((*ready_reg)->latency < MAX_ALU_LATENCY); m_result_bus[resbus]->set((*ready_reg)->latency); m_fu[n]->issue(issue_inst); - warp_inst_t** instr = issue_inst.get_ready(partition_issue, reg_id); } else if (!schedule_wb_now) { m_fu[n]->issue(issue_inst); - warp_inst_t** instr = issue_inst.get_ready(partition_issue, reg_id); } else { // stall issue (cannot reserve result bus) } From 5945d709530cc1419f624ffb048739f2b70ee1b9 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Thu, 13 May 2021 10:42:38 -0400 Subject: [PATCH 033/154] specialized unit should be partitioned too --- src/gpgpu-sim/shader.cc | 6 +++--- src/gpgpu-sim/shader.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c72ed95db..3059b517f 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -411,7 +411,7 @@ void shader_core_ctx::create_exec_pipeline() { m_fu.push_back(new specialized_unit( &m_pipeline_reg[EX_WB], m_config, this, SPEC_UNIT_START_ID + j, m_config->m_specialized_unit[j].name, - m_config->m_specialized_unit[j].latency)); + m_config->m_specialized_unit[j].latency, k)); m_dispatch_port.push_back(m_config->m_specialized_unit[j].ID_OC_SPEC_ID); m_issue_port.push_back(m_config->m_specialized_unit[j].OC_EX_SPEC_ID); } @@ -2228,8 +2228,8 @@ sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency) - : pipelined_simd_unit(result_port, config, latency, core, 0) { + char *unit_name, unsigned latency, unsigned issue_reg_id) + : pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { m_name = unit_name; m_supported_op = supported_op; } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 7987427d1..fa71af36c 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1239,7 
+1239,7 @@ class specialized_unit : public pipelined_simd_unit { public: specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency); + char *unit_name, unsigned latency, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { if (inst.op != m_supported_op) { return false; @@ -1248,7 +1248,7 @@ class specialized_unit : public pipelined_simd_unit { } virtual void active_lanes_in_pipeline(); virtual void issue(register_set &source_reg); - bool is_issue_partitioned() { return false; } + bool is_issue_partitioned() { return true; } private: unsigned m_supported_op; From 92c814a49dc98e282a46031543d289426dc04b00 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Thu, 13 May 2021 10:54:41 -0400 Subject: [PATCH 034/154] run changes through clang-format --- src/abstract_hardware_model.h | 32 +++++------ src/gpgpu-sim/shader.cc | 104 +++++++++++++++++++--------------- src/gpgpu-sim/shader.h | 20 ++++--- 3 files changed, 87 insertions(+), 69 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 129ed69d9..982e41606 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1291,7 +1291,7 @@ class register_set { } m_name = name; } - const char * get_name() {return m_name;} + const char *get_name() { return m_name; } bool has_free() { for (unsigned i = 0; i < regs.size(); i++) { if (regs[i]->empty()) { @@ -1342,8 +1342,8 @@ class register_set { return reg_id; } unsigned get_schd_id(unsigned reg_id) { - assert(not regs[reg_id]->empty()); - return regs[reg_id]->get_schd_id(); + assert(not regs[reg_id]->empty()); + return regs[reg_id]->get_schd_id(); } void move_in(warp_inst_t *&src) { warp_inst_t **free = get_free(); @@ -1353,14 +1353,14 @@ class register_set { // src->copy_contents_to(*get_free()); //} void move_in(bool sub_core_model, unsigned reg_id, warp_inst_t *&src) { - warp_inst_t **free; 
- if (!sub_core_model) { - free = get_free(); - } else { - assert(reg_id < regs.size()); - free = get_free(sub_core_model, reg_id); - } - move_warp(*free, src); + warp_inst_t **free; + if (!sub_core_model) { + free = get_free(); + } else { + assert(reg_id < regs.size()); + free = get_free(sub_core_model, reg_id); + } + move_warp(*free, src); } void move_out_to(warp_inst_t *&dest) { @@ -1368,7 +1368,9 @@ class register_set { move_warp(dest, *ready); } void move_out_to(bool sub_core_model, unsigned reg_id, warp_inst_t *&dest) { - if (!sub_core_model) { return move_out_to(dest);} + if (!sub_core_model) { + return move_out_to(dest); + } warp_inst_t **ready = get_ready(sub_core_model, reg_id); assert(ready != NULL); move_warp(dest, *ready); @@ -1389,13 +1391,11 @@ class register_set { return ready; } warp_inst_t **get_ready(bool sub_core_model, unsigned reg_id) { - if (!sub_core_model) - return get_ready(); + if (!sub_core_model) return get_ready(); warp_inst_t **ready; ready = NULL; assert(reg_id < regs.size()); - if (not regs[reg_id]->empty()) - ready = ®s[reg_id]; + if (not regs[reg_id]->empty()) ready = ®s[reg_id]; return ready; } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 3059b517f..e84e38d92 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -166,18 +166,15 @@ void shader_core_ctx::create_schedulers() { // must currently occur after all inputs have been initialized. std::string sched_config = m_config->gpgpu_scheduler_string; const concrete_scheduler scheduler = - sched_config.find("lrr") != std::string::npos - ? CONCRETE_SCHEDULER_LRR - : sched_config.find("two_level_active") != std::string::npos - ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE - : sched_config.find("gto") != std::string::npos - ? CONCRETE_SCHEDULER_GTO - : sched_config.find("old") != std::string::npos - ? CONCRETE_SCHEDULER_OLDEST_FIRST - : sched_config.find("warp_limiting") != - std::string::npos - ? 
CONCRETE_SCHEDULER_WARP_LIMITING - : NUM_CONCRETE_SCHEDULERS; + sched_config.find("lrr") != std::string::npos ? CONCRETE_SCHEDULER_LRR + : sched_config.find("two_level_active") != std::string::npos + ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE + : sched_config.find("gto") != std::string::npos ? CONCRETE_SCHEDULER_GTO + : sched_config.find("old") != std::string::npos + ? CONCRETE_SCHEDULER_OLDEST_FIRST + : sched_config.find("warp_limiting") != std::string::npos + ? CONCRETE_SCHEDULER_WARP_LIMITING + : NUM_CONCRETE_SCHEDULERS; assert(scheduler != NUM_CONCRETE_SCHEDULERS); for (unsigned i = 0; i < m_config->gpgpu_num_sched_per_core; i++) { @@ -1670,12 +1667,14 @@ void shader_core_ctx::execute() { unsigned issue_port = m_issue_port[n]; register_set &issue_inst = m_pipeline_reg[issue_port]; unsigned reg_id; - bool partition_issue = m_config->sub_core_model && m_fu[n]->is_issue_partitioned(); + bool partition_issue = + m_config->sub_core_model && m_fu[n]->is_issue_partitioned(); if (partition_issue) { reg_id = m_fu[n]->get_issue_reg_id(); } warp_inst_t **ready_reg = issue_inst.get_ready(partition_issue, reg_id); - if (issue_inst.has_ready(partition_issue, reg_id) && m_fu[n]->can_issue(**ready_reg)) { + if (issue_inst.has_ready(partition_issue, reg_id) && + m_fu[n]->can_issue(**ready_reg)) { bool schedule_wb_now = !m_fu[n]->stallable(); int resbus = -1; if (schedule_wb_now && @@ -1685,7 +1684,7 @@ void shader_core_ctx::execute() { m_fu[n]->issue(issue_inst); } else if (!schedule_wb_now) { m_fu[n]->issue(issue_inst); - } else { + } else { // stall issue (cannot reserve result bus) } } @@ -2118,15 +2117,17 @@ simd_function_unit::simd_function_unit(const shader_core_config *config) { } void simd_function_unit::issue(register_set &source_reg) { - bool partition_issue = m_config->sub_core_model && this->is_issue_partitioned(); - source_reg.move_out_to(partition_issue, this->get_issue_reg_id(), m_dispatch_reg); - occupied.set(m_dispatch_reg->latency); - } + bool partition_issue = + 
m_config->sub_core_model && this->is_issue_partitioned(); + source_reg.move_out_to(partition_issue, this->get_issue_reg_id(), + m_dispatch_reg); + occupied.set(m_dispatch_reg->latency); +} sfu::sfu(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, config->max_sfu_latency, core, - issue_reg_id) { + issue_reg_id) { m_name = "SFU"; } @@ -2139,7 +2140,8 @@ tensor_core::tensor_core(register_set *result_port, } void sfu::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SFU__OP; @@ -2148,7 +2150,8 @@ void sfu::issue(register_set &source_reg) { } void tensor_core::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = TENSOR_CORE__OP; @@ -2221,14 +2224,16 @@ void tensor_core::active_lanes_in_pipeline() { sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned issue_reg_id) - : pipelined_simd_unit(result_port, config, config->max_sp_latency, core, issue_reg_id) { + : pipelined_simd_unit(result_port, config, config->max_sp_latency, core, + issue_reg_id) { m_name = "SP "; } specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned supported_op, - char *unit_name, unsigned latency, unsigned issue_reg_id) + char *unit_name, unsigned latency, + unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { m_name = unit_name; m_supported_op = supported_op; @@ -2236,18 
+2241,21 @@ specialized_unit::specialized_unit(register_set *result_port, dp_unit::dp_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned issue_reg_id) - : pipelined_simd_unit(result_port, config, config->max_dp_latency, core, issue_reg_id) { + : pipelined_simd_unit(result_port, config, config->max_dp_latency, core, + issue_reg_id) { m_name = "DP "; } int_unit::int_unit(register_set *result_port, const shader_core_config *config, shader_core_ctx *core, unsigned issue_reg_id) - : pipelined_simd_unit(result_port, config, config->max_int_latency, core, issue_reg_id) { + : pipelined_simd_unit(result_port, config, config->max_int_latency, core, + issue_reg_id) { m_name = "INT "; } void sp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2255,7 +2263,8 @@ void sp_unit ::issue(register_set &source_reg) { } void dp_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = DP__OP; m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2271,7 +2280,8 @@ void specialized_unit ::issue(register_set &source_reg) { } void int_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = INTP__OP; 
m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); @@ -2317,8 +2327,10 @@ void pipelined_simd_unit::cycle() { void pipelined_simd_unit::issue(register_set &source_reg) { // move_warp(m_dispatch_reg,source_reg); - bool partition_issue = m_config->sub_core_model && this->is_issue_partitioned(); - warp_inst_t **ready_reg = source_reg.get_ready(partition_issue, m_issue_reg_id); + bool partition_issue = + m_config->sub_core_model && this->is_issue_partitioned(); + warp_inst_t **ready_reg = + source_reg.get_ready(partition_issue, m_issue_reg_id); m_core->incexecstat((*ready_reg)); // source_reg.move_out_to(m_dispatch_reg); simd_function_unit::issue(source_reg); @@ -3886,7 +3898,8 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { unsigned reg_id; if (sub_core_model) { assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); - assert(m_num_warp_scheds <= m_cu.size() && m_cu.size() % m_num_warp_scheds == 0); + assert(m_num_warp_scheds <= m_cu.size() && + m_cu.size() % m_num_warp_scheds == 0); } m_num_banks_per_sched = num_banks / shader->get_config()->gpgpu_num_sched_per_core; @@ -3999,11 +4012,13 @@ void opndcoll_rfu_t::allocate_cu(unsigned port_num) { unsigned cuLowerBound = 0; unsigned cuUpperBound = cu_set.size(); unsigned schd_id; - if(sub_core_model) { - // Sub core model only allocates on the subset of CUs assigned to the scheduler that issued + if (sub_core_model) { + // Sub core model only allocates on the subset of CUs assigned to the + // scheduler that issued unsigned reg_id = (*inp.m_in[i]).get_ready_reg_id(); schd_id = (*inp.m_in[i]).get_schd_id(reg_id); - assert(cu_set.size() % m_num_warp_scheds == 0 && cu_set.size() >= m_num_warp_scheds); + assert(cu_set.size() % m_num_warp_scheds == 0 && + cu_set.size() >= m_num_warp_scheds); unsigned cusPerSched = cu_set.size() / m_num_warp_scheds; cuLowerBound = schd_id * cusPerSched; cuUpperBound = cuLowerBound + cusPerSched; @@ -4019,8 +4034,9 @@ void 
opndcoll_rfu_t::allocate_cu(unsigned port_num) { } if (allocated) break; // cu has been allocated, no need to search more. } - //break; // can only service a single input, if it failed it will fail for - // others. + // break; // can only service a single input, if it failed it will fail + // for + // others. } } } @@ -4067,7 +4083,8 @@ void opndcoll_rfu_t::allocate_reads() { } bool opndcoll_rfu_t::collector_unit_t::ready() const { - return (!m_free) && m_not_ready.none() && (*m_output_register).has_free(m_sub_core_model, m_reg_id); + return (!m_free) && m_not_ready.none() && + (*m_output_register).has_free(m_sub_core_model, m_reg_id); } void opndcoll_rfu_t::collector_unit_t::dump( @@ -4085,13 +4102,10 @@ void opndcoll_rfu_t::collector_unit_t::dump( } } -void opndcoll_rfu_t::collector_unit_t::init(unsigned n, unsigned num_banks, - unsigned log2_warp_size, - const core_config *config, - opndcoll_rfu_t *rfu, - bool sub_core_model, - unsigned reg_id, - unsigned banks_per_sched) { +void opndcoll_rfu_t::collector_unit_t::init( + unsigned n, unsigned num_banks, unsigned log2_warp_size, + const core_config *config, opndcoll_rfu_t *rfu, bool sub_core_model, + unsigned reg_id, unsigned banks_per_sched) { m_rfu = rfu; m_cuid = n; m_num_banks = num_banks; diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index fa71af36c..8c02fd7c1 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -238,7 +238,10 @@ class shd_warp_t { unsigned get_dynamic_warp_id() const { return m_dynamic_warp_id; } unsigned get_warp_id() const { return m_warp_id; } - class shader_core_ctx * get_shader() { return m_shader; } + class shader_core_ctx *get_shader() { + return m_shader; + } + private: static const unsigned IBUFFER_SIZE = 2; class shader_core_ctx *m_shader; @@ -883,7 +886,8 @@ class opndcoll_rfu_t { // operand collector based register file unit // modifiers void init(unsigned n, unsigned num_banks, unsigned log2_warp_size, const core_config *config, opndcoll_rfu_t *rfu, 
- bool m_sub_core_model, unsigned reg_id, unsigned num_banks_per_sched); + bool m_sub_core_model, unsigned reg_id, + unsigned num_banks_per_sched); bool allocate(register_set *pipeline_reg, register_set *output_reg); void collect_operand(unsigned op) { m_not_ready.reset(op); } @@ -907,7 +911,7 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_banks_per_sched; bool m_sub_core_model; - unsigned m_reg_id; // if sub_core_model enabled, limit regs this cu can r/w + unsigned m_reg_id; // if sub_core_model enabled, limit regs this cu can r/w }; class dispatch_unit_t { @@ -1051,7 +1055,7 @@ class simd_function_unit { return m_dispatch_reg->empty() && !occupied.test(inst.latency); } virtual bool is_issue_partitioned() = 0; - virtual unsigned get_issue_reg_id() = 0; + virtual unsigned get_issue_reg_id() = 0; virtual bool stallable() const = 0; virtual void print(FILE *fp) const { fprintf(fp, "%s dispatch= ", m_name.c_str()); @@ -1109,8 +1113,8 @@ class pipelined_simd_unit : public simd_function_unit { warp_inst_t **m_pipeline_reg; register_set *m_result_port; class shader_core_ctx *m_core; - unsigned m_issue_reg_id; // if sub_core_model is enabled we can only issue from a - // subset of operand collectors + unsigned m_issue_reg_id; // if sub_core_model is enabled we can only issue + // from a subset of operand collectors unsigned active_insts_in_pipeline; }; @@ -2145,8 +2149,8 @@ class shader_core_ctx : public core_t { friend class TwoLevelScheduler; friend class LooseRoundRobbinScheduler; virtual void issue_warp(register_set &warp, const warp_inst_t *pI, - const active_mask_t &active_mask, unsigned warp_id, - unsigned sch_id); + const active_mask_t &active_mask, unsigned warp_id, + unsigned sch_id); void create_front_pipeline(); void create_schedulers(); From db1019769b9fb8776f3934e9ba5fd47437a5cee5 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Thu, 13 May 2021 11:39:33 -0400 Subject: [PATCH 035/154] rm old dirs in format-code.sh --- 
format-code.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/format-code.sh b/format-code.sh index fb1cc909a..9f470854b 100755 --- a/format-code.sh +++ b/format-code.sh @@ -9,7 +9,4 @@ clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc clang-format -i ${THIS_DIR}/src/cuda-sim/*.h clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc clang-format -i ${THIS_DIR}/src/gpuwattch/*.h -clang-format -i ${THIS_DIR}/src/gpuwattch/*.cc -clang-format -i ${THIS_DIR}/src/trace-driven/*.h -clang-format -i ${THIS_DIR}/src/trace-driven/*.cc -clang-format -i ${THIS_DIR}/src/trace-driven/ISA_Def/*.h +clang-format -i ${THIS_DIR}/src/gpuwattch/*.cc \ No newline at end of file From c52626267907b42ac6b611d7d7d0eaae3c825600 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Thu, 13 May 2021 13:45:59 -0400 Subject: [PATCH 036/154] fix adaptive cache cfg option parsing data type --- src/gpgpu-sim/gpu-sim.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 165068879..fd36e006a 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -326,7 +326,7 @@ void shader_core_config::reg_options(class OptionParser *opp) { option_parser_register( opp, "-gpgpu_shmem_size", OPT_UINT32, &gpgpu_shmem_size, "Size of shared memory per shader core (default 16kB)", "16384"); - option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_UINT32, + option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_BOOL, &adaptive_cache_config, "adaptive_cache_config", "0"); option_parser_register( opp, "-gpgpu_shmem_sizeDefault", OPT_UINT32, &gpgpu_shmem_sizeDefault, From f2a7d9ce6cd13977d97a0601d732551a5451ac71 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Sat, 15 May 2021 09:09:20 -0400 Subject: [PATCH 037/154] fixing streaming cache based on recent ubench --- .../tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- src/gpgpu-sim/gpu-cache.cc | 13 ------- src/gpgpu-sim/gpu-cache.h | 38 +++++++++++-------- src/gpgpu-sim/shader.cc | 15 ++++++++ 4 files changed, 39 insertions(+), 29 deletions(-) diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 3fa51ee14..3af314c9e 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -116,7 +116,7 @@ -gpgpu_adaptive_cache_config 1 # Volta unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 1c36d224c..c6a125d8d 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -312,15 +312,6 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, abort(); // if an unreserved block exists, it is either invalid or // replaceable - if (probe_mode && m_config.is_streaming()) { - line_table::const_iterator i = - pending_lines.find(m_config.block_addr(addr)); - assert(mf); - if (!mf->is_write() && i != pending_lines.end()) { - if (i->second != mf->get_inst().get_uid()) return SECTOR_MISS; - } - } - return MISS; } @@ -1060,7 +1051,6 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { m_tag_array->fill(e->second.m_cache_index, time, mf); else if (m_config.m_alloc_policy == ON_FILL) { m_tag_array->fill(e->second.m_block_addr, time, mf); - if (m_config.is_streaming()) m_tag_array->remove_pending_line(mf); } else abort(); bool has_atomic = false; @@ -1136,9 +1126,6 @@ void baseline_cache::send_read_request(new_addr_type addr, m_tag_array->access(block_addr, 
time, cache_index, wb, evicted, mf); m_mshrs.add(mshr_addr, mf); - if (m_config.is_streaming() && m_config.m_cache_type == SECTOR) { - m_tag_array->add_pending_line(mf); - } m_extra_mf_fields[mf] = extra_mf_fields( mshr_addr, mf->get_addr(), cache_index, mf->get_data_size(), m_config); mf->set_data_size(m_config.get_atom_sz()); diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 00c09ae55..aa0a7e85a 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -574,22 +574,26 @@ class cache_config { exit_parse_error(); } if (m_alloc_policy == STREAMING) { - // For streaming cache, we set the alloc policy to be on-fill to remove - // all line_alloc_fail stalls we set the MSHRs to be equal to max - // allocated cache lines. This is possible by moving TAG to be shared - // between cache line and MSHR enrty (i.e. for each cache line, there is - // an MSHR rntey associated with it) This is the easiest think we can - // think about to model (mimic) L1 streaming cache in Pascal and Volta - // Based on our microbenchmakrs, MSHRs entries have been increasing - // substantially in Pascal and Volta For more information about streaming - // cache, see: - // http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf - // https://ieeexplore.ieee.org/document/8344474/ + /* + For streaming cache: + (1) we set the alloc policy to be on-fill to remove all line_alloc_fail stalls. + if the whole memory is allocated to the L1 cache, then make the allocation to be on_MISS + otherwise, make it ON_FILL to eliminate line allocation fails. + i.e. MSHR throughput is the same, independent on the L1 cache size/associativity + So, we set the allocation policy per kernel basis, see shader.cc, max_cta() function + + (2) We also set the MSHRs to be equal to max + allocated cache lines. This is possible by moving TAG to be shared + between cache line and MSHR enrty (i.e. 
for each cache line, there is + an MSHR rntey associated with it). This is the easiest think we can + think of to model (mimic) L1 streaming cache in Pascal and Volta + + For more information about streaming cache, see: + http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + https://ieeexplore.ieee.org/document/8344474/ + */ m_is_streaming = true; m_alloc_policy = ON_FILL; - m_mshr_entries = m_nset * m_assoc * MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; - if (m_cache_type == SECTOR) m_mshr_entries *= SECTOR_CHUNCK_SIZE; - m_mshr_max_merge = MAX_WARP_PER_SM; } switch (mshr_type) { case 'F': @@ -638,7 +642,8 @@ class cache_config { } // detect invalid configuration - if (m_alloc_policy == ON_FILL and m_write_policy == WRITE_BACK) { + if ((m_alloc_policy == ON_FILL || m_alloc_policy == STREAMING) + and m_write_policy == WRITE_BACK) { // A writeback cache with allocate-on-fill policy will inevitably lead to // deadlock: The deadlock happens when an incoming cache-fill evicts a // dirty line, generating a writeback request. If the memory subsystem is @@ -750,6 +755,9 @@ class cache_config { } bool is_streaming() { return m_is_streaming; } FuncCache get_cache_status() { return cache_status; } + void set_allocation_policy(enum allocation_policy_t alloc) { + m_alloc_policy = alloc; + } char *m_config_string; char *m_config_stringPrefL1; char *m_config_stringPrefShared; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c6e7b8f67..0ad9547b0 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3308,6 +3308,21 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { m_L1D_config.get_total_size_inKB()); } + if(m_L1D_config.is_streaming()) { + //for streaming cache, if the whole memory is allocated + //to the L1 cache, then make the allocation to be on_MISS + //otherwise, make it ON_FILL to eliminate line allocation fails + //i.e. 
MSHR throughput is the same, independent on the L1 cache size/associativity + if(total_shmed == 0) { + m_L1D_config.set_allocation_policy(ON_MISS); + printf("GPGPU-Sim: Reconfigure L1 allocation to ON_MISS\n"); + } + else { + m_L1D_config.set_allocation_policy(ON_FILL); + printf("GPGPU-Sim: Reconfigure L1 allocation to ON_FILL\n"); + } + } + k.cache_config_set = true; } From 134739518a4a0f8a66cbf8c8a44b1a0ce178f7d5 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Sat, 15 May 2021 09:15:38 -0400 Subject: [PATCH 038/154] adding the missing xoring hashing --- src/gpgpu-sim/gpu-cache.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index aa0a7e85a..b2db1c5ff 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -689,6 +689,9 @@ class cache_config { case 'L': m_set_index_function = LINEAR_SET_FUNCTION; break; + case 'X': + m_set_index_function = BITWISE_XORING_FUNCTION; + break; default: exit_parse_error(); } From 6319e31a8ee5ebac7499756029878a1ebbb4384e Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Sat, 15 May 2021 09:23:23 -0400 Subject: [PATCH 039/154] moving reg file read to read_operands function as before --- src/gpgpu-sim/shader.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 0ad9547b0..e6bfca042 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1569,7 +1569,10 @@ void swl_scheduler::order_warps() { } } -void shader_core_ctx::read_operands() {} +void shader_core_ctx::read_operands() { + for (int i = 0; i < m_config->reg_file_port_throughput; ++i) + m_operand_collector.step(); +} address_type coalesced_segment(address_type addr, unsigned segment_size_lg2bytes) { @@ -2550,8 +2553,7 @@ inst->space.get_type() != shared_space) { unsigned warp_id = inst->warp_id(); */ void ldst_unit::cycle() { writeback(); - for (int i = 0; i < m_config->reg_file_port_throughput; ++i) - m_operand_collector->step(); + for (unsigned stage = 0; (stage + 1) < m_pipeline_depth; stage++) if (m_pipeline_reg[stage]->empty() && !m_pipeline_reg[stage + 1]->empty()) move_warp(m_pipeline_reg[stage], m_pipeline_reg[stage + 1]); From c94b883ac62e3b7dfbc69f6bad3b4c86b62eeb8c Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Mon, 17 May 2021 10:57:48 -0400 Subject: [PATCH 040/154] code refactoring cycle() --- src/gpgpu-sim/shader.cc | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index e6bfca042..34040fba0 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1228,22 +1228,6 @@ void scheduler_unit::cycle() { previous_issued_inst_exec_type = exec_unit_type_t::MEM; } } else { - bool sp_pipe_avail = - (m_shader->m_config->gpgpu_num_sp_units > 0) && - m_sp_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool sfu_pipe_avail = - (m_shader->m_config->gpgpu_num_sfu_units > 0) && - m_sfu_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool tensor_core_pipe_avail = - (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && - m_tensor_core_out->has_free( - m_shader->m_config->sub_core_model, m_id); - bool dp_pipe_avail = - (m_shader->m_config->gpgpu_num_dp_units > 0) && - m_dp_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool int_pipe_avail = - (m_shader->m_config->gpgpu_num_int_units > 0) && - m_int_out->has_free(m_shader->m_config->sub_core_model, m_id); // This code need to be refactored if (pI->op != TENSOR_CORE_OP && pI->op != SFU_OP && @@ -1251,6 +1235,13 @@ void scheduler_unit::cycle() { bool execute_on_SP = false; bool execute_on_INT = false; + bool sp_pipe_avail = + (m_shader->m_config->gpgpu_num_sp_units > 0) && + m_sp_out->has_free(m_shader->m_config->sub_core_model, m_id); + bool int_pipe_avail = + (m_shader->m_config->gpgpu_num_int_units > 0) && + m_int_out->has_free(m_shader->m_config->sub_core_model, m_id); + // if INT unit pipline exist, then execute ALU and INT // operations on INT unit and SP-FPU on SP unit (like in Volta) // if INT unit pipline does not exist, then execute all ALU, INT @@ -1311,6 +1302,11 @@ void scheduler_unit::cycle() { (pI->op == DP_OP) && !(diff_exec_units && 
previous_issued_inst_exec_type == exec_unit_type_t::DP)) { + + bool dp_pipe_avail = + (m_shader->m_config->gpgpu_num_dp_units > 0) && + m_dp_out->has_free(m_shader->m_config->sub_core_model, m_id); + if (dp_pipe_avail) { m_shader->issue_warp(*m_dp_out, pI, active_mask, warp_id, m_id); @@ -1326,6 +1322,11 @@ void scheduler_unit::cycle() { (pI->op == SFU_OP) || (pI->op == ALU_SFU_OP)) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::SFU)) { + + bool sfu_pipe_avail = + (m_shader->m_config->gpgpu_num_sfu_units > 0) && + m_sfu_out->has_free(m_shader->m_config->sub_core_model, m_id); + if (sfu_pipe_avail) { m_shader->issue_warp(*m_sfu_out, pI, active_mask, warp_id, m_id); @@ -1337,6 +1338,12 @@ void scheduler_unit::cycle() { } else if ((pI->op == TENSOR_CORE_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::TENSOR)) { + + bool tensor_core_pipe_avail = + (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && + m_tensor_core_out->has_free( + m_shader->m_config->sub_core_model, m_id); + if (tensor_core_pipe_avail) { m_shader->issue_warp(*m_tensor_core_out, pI, active_mask, warp_id, m_id); From 7d9a12fb096db5492924ec32a96c9052552e8579 Mon Sep 17 00:00:00 2001 From: Aaron Barnes Date: Mon, 17 May 2021 12:46:35 -0400 Subject: [PATCH 041/154] specialized unit get_ready() was missing subcore --- src/gpgpu-sim/shader.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index e84e38d92..14d904424 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -2272,7 +2272,8 @@ void dp_unit ::issue(register_set &source_reg) { } void specialized_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = source_reg.get_ready(); + warp_inst_t **ready_reg = + source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SPECIALIZED__OP; 
m_core->incsp_stat(m_core->get_config()->warp_size, (*ready_reg)->latency); From 0f3030542e1987543c5fd4e497f7d422422e73fa Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 15 Feb 2021 12:42:14 -0500 Subject: [PATCH 042/154] dirty counter added. NO increamenting yet --- src/gpgpu-sim/gpu-cache.cc | 24 +++++++++++++++--------- src/gpgpu-sim/gpu-cache.h | 6 ++++-- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 1c36d224c..763705f91 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -284,15 +284,20 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, invalid_line = index; } else { // valid line : keep track of most appropriate replacement candidate - if (m_config.m_replacement_policy == LRU) { - if (line->get_last_access_time() < valid_timestamp) { - valid_timestamp = line->get_last_access_time(); - valid_line = index; - } - } else if (m_config.m_replacement_policy == FIFO) { - if (line->get_alloc_time() < valid_timestamp) { - valid_timestamp = line->get_alloc_time(); - valid_line = index; + if (!line->get_status(mask) == MODIFIED || + 100 * m_dirty/(m_config.m_nset * m_config.m_assoc) >= m_config.m_wr_percent) { + // don't evict write until dirty lines reach threshold + // make sure at least 1 candidate is assigned + if (m_config.m_replacement_policy == LRU) { + if (line->get_last_access_time() < valid_timestamp) { + valid_timestamp = line->get_last_access_time(); + valid_line = index; + } + } else if (m_config.m_replacement_policy == FIFO) { + if (line->get_alloc_time() < valid_timestamp) { + valid_timestamp = line->get_alloc_time(); + valid_line = index; + } } } } @@ -418,6 +423,7 @@ void tag_array::flush() { if (m_lines[i]->is_modified_line()) { for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); + m_dirty--; } is_used = false; diff 
--git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 00c09ae55..9dbfe8251 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -498,10 +498,10 @@ class cache_config { char ct, rp, wp, ap, mshr_type, wap, sif; int ntok = - sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u", &ct, + sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u,%u", &ct, &m_nset, &m_line_sz, &m_assoc, &rp, &wp, &ap, &wap, &sif, &mshr_type, &m_mshr_entries, &m_mshr_max_merge, - &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width); + &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width, &m_wr_percent); if (ntok < 12) { if (!strcmp(config, "none")) { @@ -801,6 +801,7 @@ class cache_config { unsigned m_data_port_width; //< number of byte the cache can access per cycle enum set_index_function m_set_index_function; // Hash, linear, or custom set index function + unsigned m_wr_percent; friend class tag_array; friend class baseline_cache; @@ -897,6 +898,7 @@ class tag_array { // allocated but not filled unsigned m_res_fail; unsigned m_sector_miss; + unsigned m_dirty; // performance counters for calculating the amount of misses within a time // window From 615f173c25883fbc8db0363279e2eb216acb8c7e Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Sat, 20 Feb 2021 16:03:42 -0500 Subject: [PATCH 043/154] store ack for new waps --- src/gpgpu-sim/gpu-cache.h | 6 ++++++ src/gpgpu-sim/shader.cc | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 9dbfe8251..381ce944e 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -754,6 +754,9 @@ class cache_config { char *m_config_stringPrefL1; char *m_config_stringPrefShared; FuncCache cache_status; + write_allocate_policy_t get_write_allocate_policy() { + return m_write_alloc_policy; + } protected: void exit_parse_error() { @@ -878,6 +881,9 @@ class tag_array { void 
update_cache_parameters(cache_config &config); void add_pending_line(mem_fetch *mf); void remove_pending_line(mem_fetch *mf); + void inc_dirty() { + m_dirty++; + } protected: // This constructor is intended for use only from derived classes that wish to diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 14d904424..4769ca885 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1974,6 +1974,18 @@ void ldst_unit::L1_latency_queue_cycle() { } else { assert(status == MISS || status == HIT_RESERVED); l1_latency_queue[j][0] = NULL; + if (mf_next->get_inst().is_store() && + (m_config->m_L1D_config.get_write_allocate_policy() == FETCH_ON_WRITE || + m_config->m_L1D_config.get_write_allocate_policy() == LAZY_FETCH_ON_READ) && + !was_writeallocate_sent(events)) { + unsigned dec_ack = + (m_config->m_L1D_config.get_mshr_type() == SECTOR_ASSOC) + ? (mf_next->get_data_size() / SECTOR_SIZE) + : 1; + mf_next->set_reply(); + for (unsigned i = 0; i < dec_ack; ++i) m_core->store_ack(mf_next); + if (!write_sent && !read_sent) delete mf_next; + } } } From ad7204189b79be89575d969b305c529a31a2a765 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 2 Mar 2021 16:30:27 -0500 Subject: [PATCH 044/154] sending cache block byte mask --- src/abstract_hardware_model.h | 6 ++++++ src/gpgpu-sim/gpu-cache.cc | 21 ++++++++++++++++----- src/gpgpu-sim/gpu-cache.h | 28 ++++++++++++++++++++++++++++ src/gpgpu-sim/l2cache.cc | 14 ++++++++++++++ src/gpgpu-sim/l2cache.h | 6 ++++++ src/gpgpu-sim/shader.cc | 15 +++++++++++++++ src/gpgpu-sim/shader.h | 6 ++++++ 7 files changed, 91 insertions(+), 5 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 982e41606..e09acdbf8 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -869,6 +869,12 @@ class mem_fetch_allocator { virtual mem_fetch *alloc(const class warp_inst_t &inst, const mem_access_t &access, unsigned long long 
cycle) const = 0; + virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle) const = 0; }; // the maximum number of destination, source, or address uarch operands in a diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 763705f91..ded800461 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -358,8 +358,13 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, if (m_config.m_alloc_policy == ON_MISS) { if (m_lines[idx]->is_modified_line()) { wb = true; + ((sector_cache_block *)m_lines[idx])->set_byte_mask(mf); evicted.set_info(m_lines[idx]->m_block_addr, - m_lines[idx]->get_modified_size()); + m_lines[idx]->get_modified_size(), + ((sector_cache_block *)m_lines[idx]) + ->get_byte_mask(), + ((sector_cache_block *)m_lines[idx]) + ->get_sector_mask()); } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mf->get_access_sector_mask()); @@ -1464,6 +1469,8 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); + block->set_status(MODIFIED, mf->get_access_sector_mask()); + ((sector_cache_block *)block)->set_byte_mask(mf); if (m_status == HIT_RESERVED) { block->set_ignore_on_fill(true, mf->get_access_sector_mask()); block->set_modified_on_fill(true, mf->get_access_sector_mask()); @@ -1484,8 +1491,10 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + 
evicted.m_block_addr,m_wrbk_type, + mf->get_access_warp_mask(), evicted.m_byte_mask, + evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1560,8 +1569,10 @@ enum cache_request_status data_cache::rd_miss_base( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr,m_wrbk_type, + mf->get_access_warp_mask(), evicted.m_byte_mask, + evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 381ce944e..042c1d6b7 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -72,14 +72,26 @@ enum cache_event_type { struct evicted_block_info { new_addr_type m_block_addr; unsigned m_modified_size; + mem_access_byte_mask_t m_byte_mask; + mem_access_sector_mask_t m_sector_mask; evicted_block_info() { m_block_addr = 0; m_modified_size = 0; + m_byte_mask.reset(); + m_sector_mask.reset(); } void set_info(new_addr_type block_addr, unsigned modified_size) { m_block_addr = block_addr; m_modified_size = modified_size; } + void set_info(new_addr_type block_addr, unsigned modified_size, + mem_access_byte_mask_t byte_mask, + mem_access_sector_mask_t sector_mask) { + m_block_addr = block_addr; + m_modified_size = modified_size; + m_byte_mask = byte_mask; + m_sector_mask = sector_mask; + } }; struct cache_event { @@ -251,6 +263,7 @@ struct 
sector_cache_block : public cache_block_t { m_line_alloc_time = 0; m_line_last_access_time = 0; m_line_fill_time = 0; + m_byte_mask.reset(); } virtual void allocate(new_addr_type tag, new_addr_type block_addr, @@ -362,6 +375,20 @@ struct sector_cache_block : public cache_block_t { m_status[sidx] = status; } + virtual void set_byte_mask(mem_fetch *mf) { + m_byte_mask = m_byte_mask | mf->get_access_byte_mask();; + } + virtual mem_access_byte_mask_t get_byte_mask() { + return m_byte_mask; + } + virtual mem_access_sector_mask_t get_sector_mask() { + mem_access_sector_mask_t sector_mask; + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (m_status[i] == MODIFIED) + sector_mask.set(i); + } + return sector_mask; + } virtual unsigned long long get_last_access_time() { return m_line_last_access_time; } @@ -429,6 +456,7 @@ struct sector_cache_block : public cache_block_t { bool m_set_modified_on_fill[SECTOR_CHUNCK_SIZE]; bool m_set_readable_on_fill[SECTOR_CHUNCK_SIZE]; bool m_readable[SECTOR_CHUNCK_SIZE]; + mem_access_byte_mask_t m_byte_mask; unsigned get_sector_index(mem_access_sector_mask_t sector_mask) { assert(sector_mask.count() == 1); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index ab6e5c228..cd04af57a 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -57,6 +57,20 @@ mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, return mf; } +mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, + mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, + sector_mask, m_memory_config->gpgpu_ctx); + mem_fetch *mf = + new mem_fetch(access, NULL, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, + -1, -1, m_memory_config, cycle); + return mf; +} memory_partition_unit::memory_partition_unit(unsigned partition_id, const memory_config *config, class memory_stats_t *stats, diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 3152db337..1f5d7c468 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -51,6 +51,12 @@ class partition_mf_allocator : public mem_fetch_allocator { virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle) const; + virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle) const; private: const memory_config *m_memory_config; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 4769ca885..4b4c98db7 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -61,6 +61,21 @@ mem_fetch *shader_core_mem_fetch_allocator::alloc( m_core_id, m_cluster_id, m_memory_config, cycle); return mf; } + +mem_fetch *shader_core_mem_fetch_allocator::alloc( + new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, + sector_mask, m_memory_config->gpgpu_ctx); + mem_fetch *mf = + new mem_fetch(access, NULL, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, + m_core_id, m_cluster_id, m_memory_config, cycle); + return mf; + } ///////////////////////////////////////////////////////////////////////////// std::list shader_core_ctx::get_regs_written(const inst_t &fvt) const { diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 8c02fd7c1..a7a2c02d6 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1872,6 +1872,12 @@ class shader_core_mem_fetch_allocator : public mem_fetch_allocator { } mem_fetch *alloc(new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle) const; + mem_fetch *alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle) const; mem_fetch *alloc(const warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const { warp_inst_t inst_copy = inst; From bb19c0cbfa2dc8082496a279f37f48695b7c4185 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 2 Mar 2021 16:32:29 -0500 Subject: [PATCH 045/154] update mf breakdown at L2 --- src/gpgpu-sim/l2cache.cc | 92 +++++++++++++--------------------------- 1 file changed, 29 insertions(+), 63 deletions(-) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index cd04af57a..63119ee90 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -555,10 +555,15 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { m_config->m_L2_config.m_write_alloc_policy == LAZY_FETCH_ON_READ) && !was_writeallocate_sent(events)) { - mf->set_reply(); - mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, - m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); - m_L2_icnt_queue->push(mf); + if (mf->get_access_type() == L1_WRBK_ACC) { + m_request_tracker.erase(mf); + delete mf; + } else { + mf->set_reply(); + mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, 
+ m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + m_L2_icnt_queue->push(mf); + } } // L2 cache accepted request m_icnt_L2_queue->pop(); @@ -708,71 +713,32 @@ bool memory_sub_partition::busy() const { return !m_request_tracker.empty(); } std::vector memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { std::vector result; - + mem_access_sector_mask_t sector_mask = mf->get_access_sector_mask(); if (mf->get_data_size() == SECTOR_SIZE && mf->get_access_sector_mask().count() == 1) { result.push_back(mf); - } else if (mf->get_data_size() == 128 || mf->get_data_size() == 64) { - // We only accept 32, 64 and 128 bytes reqs - unsigned start = 0, end = 0; - if (mf->get_data_size() == 128) { - start = 0; - end = 3; - } else if (mf->get_data_size() == 64 && - mf->get_access_sector_mask().to_string() == "1100") { - start = 2; - end = 3; - } else if (mf->get_data_size() == 64 && - mf->get_access_sector_mask().to_string() == "0011") { - start = 0; - end = 1; - } else if (mf->get_data_size() == 64 && - (mf->get_access_sector_mask().to_string() == "1111" || - mf->get_access_sector_mask().to_string() == "0000")) { - if (mf->get_addr() % 128 == 0) { - start = 0; - end = 1; - } else { - start = 2; - end = 3; + } else { + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (sector_mask.test(i)) { + mem_access_byte_mask_t mask; + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); + } + const mem_access_t *ma = new mem_access_t( + mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, + SECTOR_SIZE, mf->is_write(), mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask, + std::bitset().set(i), m_gpu->gpgpu_ctx); + + mem_fetch *n_mf = + new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + + result.push_back(n_mf); } - } else { - printf( - "Invalid sector received, address = 0x%06llx, sector mask = 
%s, data " - "size = %d", - mf->get_addr(), mf->get_access_sector_mask(), mf->get_data_size()); - assert(0 && "Undefined sector mask is received"); } - - std::bitset byte_sector_mask; - byte_sector_mask.reset(); - for (unsigned k = start * SECTOR_SIZE; k < SECTOR_SIZE; ++k) - byte_sector_mask.set(k); - - for (unsigned j = start, i = 0; j <= end; ++j, ++i) { - const mem_access_t *ma = new mem_access_t( - mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, SECTOR_SIZE, - mf->is_write(), mf->get_access_warp_mask(), - mf->get_access_byte_mask() & byte_sector_mask, - std::bitset().set(j), m_gpu->gpgpu_ctx); - - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); - - result.push_back(n_mf); - byte_sector_mask <<= SECTOR_SIZE; - } - } else { - printf( - "Invalid sector received, address = 0x%06llx, sector mask = %d, byte " - "mask = , data size = %u", - mf->get_addr(), mf->get_access_sector_mask().count(), - mf->get_data_size()); - assert(0 && "Undefined data size is received"); } - return result; } From e05fa4a676c2b082f1ebb34d051f43ad05d4a82c Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 2 Mar 2021 16:33:30 -0500 Subject: [PATCH 046/154] little bug fix - flush() --- src/gpgpu-sim/gpu-cache.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index ded800461..8d44f151e 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -426,9 +426,10 @@ void tag_array::flush() { for (unsigned i = 0; i < m_config.get_num_lines(); i++) if (m_lines[i]->is_modified_line()) { - for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) + for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); m_dirty--; + } } is_used = false; From 
804ee9033d5c0d8f4e0b974734c4db42b55bd1dc Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 8 Mar 2021 13:29:13 -0500 Subject: [PATCH 047/154] sending byte mask for all policies --- src/gpgpu-sim/gpu-cache.cc | 35 ++++++++++++++++++++++------------- src/gpgpu-sim/gpu-cache.h | 17 +++++++++++++++-- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 8d44f151e..2cc75bbf7 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -358,13 +358,11 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, if (m_config.m_alloc_policy == ON_MISS) { if (m_lines[idx]->is_modified_line()) { wb = true; - ((sector_cache_block *)m_lines[idx])->set_byte_mask(mf); + m_lines[idx]->set_byte_mask(mf); evicted.set_info(m_lines[idx]->m_block_addr, m_lines[idx]->get_modified_size(), - ((sector_cache_block *)m_lines[idx]) - ->get_byte_mask(), - ((sector_cache_block *)m_lines[idx]) - ->get_sector_mask()); + m_lines[idx]->get_byte_mask(), + m_lines[idx]->get_sector_mask()); } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mf->get_access_sector_mask()); @@ -1083,6 +1081,7 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { block->set_status(MODIFIED, mf->get_access_sector_mask()); // mark line as dirty for // atomic operation + block->set_byte_mask(mf); } m_extra_mf_fields.erase(mf); m_bandwidth_management.use_fill_port(mf); @@ -1189,6 +1188,7 @@ cache_request_status data_cache::wr_hit_wb(new_addr_type addr, m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); return HIT; } @@ -1208,6 +1208,7 @@ cache_request_status data_cache::wr_hit_wt(new_addr_type addr, m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state 
cache_block_t *block = m_tag_array->get_block(cache_index); block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); // generate a write-through send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); @@ -1317,8 +1318,10 @@ enum cache_request_status data_cache::wr_miss_wa_naive( assert(status == MISS); // SECTOR_MISS and HIT_RESERVED should not send write back mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr,m_wrbk_type, + mf->get_access_warp_mask(), evicted.m_byte_mask, + evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1356,6 +1359,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( assert(status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask(mf); if (status == HIT_RESERVED) block->set_ignore_on_fill(true, mf->get_access_sector_mask()); @@ -1364,8 +1368,10 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr,m_wrbk_type, + mf->get_access_warp_mask(), evicted.m_byte_mask, + evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ 
-1434,8 +1440,10 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr, m_wrbk_type, evicted.m_modified_size, true, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + evicted.m_block_addr,m_wrbk_type, + mf->get_access_warp_mask(), evicted.m_byte_mask, + evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1471,7 +1479,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); block->set_status(MODIFIED, mf->get_access_sector_mask()); - ((sector_cache_block *)block)->set_byte_mask(mf); + block->set_byte_mask(mf); if (m_status == HIT_RESERVED) { block->set_ignore_on_fill(true, mf->get_access_sector_mask()); block->set_modified_on_fill(true, mf->get_access_sector_mask()); @@ -1539,7 +1547,8 @@ enum cache_request_status data_cache::rd_hit_base( assert(mf->get_access_type() == GLOBAL_ACC_R); cache_block_t *block = m_tag_array->get_block(cache_index); block->set_status(MODIFIED, - mf->get_access_sector_mask()); // mark line as dirty + mf->get_access_sector_mask()); // mark line as dirty + block->set_byte_mask(mf); } return HIT; } diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 042c1d6b7..eb811d740 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -132,7 +132,9 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_status(enum cache_block_state m_status, mem_access_sector_mask_t sector_mask) = 0; - + virtual void set_byte_mask(mem_fetch *mf) = 0; + virtual mem_access_byte_mask_t get_byte_mask() = 0; + 
virtual mem_access_sector_mask_t get_sector_mask() = 0; virtual unsigned long long get_last_access_time() = 0; virtual void set_last_access_time(unsigned long long time, mem_access_sector_mask_t sector_mask) = 0; @@ -201,6 +203,17 @@ struct line_cache_block : public cache_block_t { mem_access_sector_mask_t sector_mask) { m_status = status; } + virtual void set_byte_mask(mem_fetch *mf) { + m_byte_mask = m_byte_mask | mf->get_access_byte_mask(); + } + virtual mem_access_byte_mask_t get_byte_mask() { + return m_byte_mask; + } + virtual mem_access_sector_mask_t get_sector_mask() { + mem_access_sector_mask_t sector_mask; + if (m_status == MODIFIED) sector_mask.set(); + return sector_mask; + } virtual unsigned long long get_last_access_time() { return m_last_access_time; } @@ -244,6 +257,7 @@ struct line_cache_block : public cache_block_t { bool m_set_modified_on_fill; bool m_set_readable_on_fill; bool m_readable; + mem_access_byte_mask_t m_byte_mask; }; struct sector_cache_block : public cache_block_t { @@ -328,7 +342,6 @@ struct sector_cache_block : public cache_block_t { // if(!m_ignore_on_fill_status[sidx]) // assert( m_status[sidx] == RESERVED ); - m_status[sidx] = m_set_modified_on_fill[sidx] ? 
MODIFIED : VALID; if (m_set_readable_on_fill[sidx]) { From b3dab5eec75f11c600bddc9a6dd3b22272363cca Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 8 Mar 2021 15:02:58 -0500 Subject: [PATCH 048/154] set byte mask on fill --- src/gpgpu-sim/gpu-cache.cc | 12 ++++++------ src/gpgpu-sim/gpu-cache.h | 37 +++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 2cc75bbf7..46813e742 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -392,11 +392,11 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, } void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf) { - fill(addr, time, mf->get_access_sector_mask()); + fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); } void tag_array::fill(new_addr_type addr, unsigned time, - mem_access_sector_mask_t mask) { + mem_access_sector_mask_t mask, mem_access_byte_mask_t byte_mask) { // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; enum cache_request_status status = probe(addr, idx, mask); @@ -410,12 +410,12 @@ void tag_array::fill(new_addr_type addr, unsigned time, ((sector_cache_block *)m_lines[idx])->allocate_sector(time, mask); } - m_lines[idx]->fill(time, mask); + m_lines[idx]->fill(time, mask, byte_mask); } void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); - m_lines[index]->fill(time, mf->get_access_sector_mask()); + m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); } // TODO: we need write back the flushed data to the upper level @@ -1432,6 +1432,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( cache_block_t *block = m_tag_array->get_block(cache_index); block->set_modified_on_fill(true, mf->get_access_sector_mask()); + block->set_byte_mask_on_fill(true); 
events.push_back(cache_event(WRITE_ALLOCATE_SENT)); @@ -1483,8 +1484,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( if (m_status == HIT_RESERVED) { block->set_ignore_on_fill(true, mf->get_access_sector_mask()); block->set_modified_on_fill(true, mf->get_access_sector_mask()); - } else { - block->set_status(MODIFIED, mf->get_access_sector_mask()); + block->set_byte_mask_on_fill(true); } if (mf->get_access_byte_mask().count() == m_config.get_atom_sz()) { diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index eb811d740..a84ddd18a 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -121,7 +121,8 @@ struct cache_block_t { virtual void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time, mem_access_sector_mask_t sector_mask) = 0; - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask) = 0; + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) = 0; virtual bool is_invalid_line() = 0; virtual bool is_valid_line() = 0; @@ -133,6 +134,7 @@ struct cache_block_t { virtual void set_status(enum cache_block_state m_status, mem_access_sector_mask_t sector_mask) = 0; virtual void set_byte_mask(mem_fetch *mf) = 0; + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) = 0; virtual mem_access_byte_mask_t get_byte_mask() = 0; virtual mem_access_sector_mask_t get_sector_mask() = 0; virtual unsigned long long get_last_access_time() = 0; @@ -145,6 +147,7 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_readable_on_fill(bool readable, mem_access_sector_mask_t sector_mask) = 0; + virtual void set_byte_mask_on_fill(bool m_modified) = 0; virtual unsigned get_modified_size() = 0; virtual void set_m_readable(bool readable, mem_access_sector_mask_t sector_mask) = 0; @@ -178,8 +181,10 @@ struct line_cache_block : public cache_block_t { m_ignore_on_fill_status = false; m_set_modified_on_fill = false; 
m_set_readable_on_fill = false; + m_set_byte_mask_on_fill = false; } - void fill(unsigned time, mem_access_sector_mask_t sector_mask) { + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) { // if(!m_ignore_on_fill_status) // assert( m_status == RESERVED ); @@ -187,6 +192,7 @@ struct line_cache_block : public cache_block_t { if (m_set_readable_on_fill) m_readable = true; + if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); m_fill_time = time; } @@ -206,6 +212,9 @@ struct line_cache_block : public cache_block_t { virtual void set_byte_mask(mem_fetch *mf) { m_byte_mask = m_byte_mask | mf->get_access_byte_mask(); } + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { + m_byte_mask = m_byte_mask | byte_mask; + } virtual mem_access_byte_mask_t get_byte_mask() { return m_byte_mask; } @@ -234,6 +243,9 @@ struct line_cache_block : public cache_block_t { mem_access_sector_mask_t sector_mask) { m_set_readable_on_fill = readable; } + virtual void set_byte_mask_on_fill(bool m_modified) { + m_set_byte_mask_on_fill = m_modified; + } virtual unsigned get_modified_size() { return SECTOR_CHUNCK_SIZE * SECTOR_SIZE; // i.e. 
cache line size } @@ -256,6 +268,7 @@ struct line_cache_block : public cache_block_t { bool m_ignore_on_fill_status; bool m_set_modified_on_fill; bool m_set_readable_on_fill; + bool m_set_byte_mask_on_fill; bool m_readable; mem_access_byte_mask_t m_byte_mask; }; @@ -303,6 +316,7 @@ struct sector_cache_block : public cache_block_t { m_ignore_on_fill_status[sidx] = false; m_set_modified_on_fill[sidx] = false; m_set_readable_on_fill[sidx] = false; + m_set_byte_mask_on_fill = false; // set line stats m_line_alloc_time = time; // only set this for the first allocated sector @@ -337,7 +351,8 @@ struct sector_cache_block : public cache_block_t { m_line_fill_time = 0; } - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask) { + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) { unsigned sidx = get_sector_index(sector_mask); // if(!m_ignore_on_fill_status[sidx]) @@ -348,6 +363,7 @@ struct sector_cache_block : public cache_block_t { m_readable[sidx] = true; m_set_readable_on_fill[sidx] = false; } + if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); m_sector_fill_time[sidx] = time; m_line_fill_time = time; @@ -389,7 +405,10 @@ struct sector_cache_block : public cache_block_t { } virtual void set_byte_mask(mem_fetch *mf) { - m_byte_mask = m_byte_mask | mf->get_access_byte_mask();; + m_byte_mask = m_byte_mask | mf->get_access_byte_mask(); + } + virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { + m_byte_mask = m_byte_mask | byte_mask; } virtual mem_access_byte_mask_t get_byte_mask() { return m_byte_mask; @@ -427,6 +446,9 @@ struct sector_cache_block : public cache_block_t { unsigned sidx = get_sector_index(sector_mask); m_set_modified_on_fill[sidx] = m_modified; } + virtual void set_byte_mask_on_fill(bool m_modified) { + m_set_byte_mask_on_fill = m_modified; + } virtual void set_readable_on_fill(bool readable, mem_access_sector_mask_t sector_mask) { @@ -468,6 +490,7 @@ struct 
sector_cache_block : public cache_block_t { bool m_ignore_on_fill_status[SECTOR_CHUNCK_SIZE]; bool m_set_modified_on_fill[SECTOR_CHUNCK_SIZE]; bool m_set_readable_on_fill[SECTOR_CHUNCK_SIZE]; + bool m_set_byte_mask_on_fill; bool m_readable[SECTOR_CHUNCK_SIZE]; mem_access_byte_mask_t m_byte_mask; @@ -904,7 +927,8 @@ class tag_array { void fill(new_addr_type addr, unsigned time, mem_fetch *mf); void fill(unsigned idx, unsigned time, mem_fetch *mf); - void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask); + void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask, + mem_access_byte_mask_t byte_mask); unsigned size() const { return m_config.get_num_lines(); } cache_block_t *get_block(unsigned idx) { return m_lines[idx]; } @@ -1291,7 +1315,8 @@ class baseline_cache : public cache_t { // something is read or written without doing anything else. void force_tag_access(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask) { - m_tag_array->fill(addr, time, mask); + mem_access_byte_mask_t byte_mask; + m_tag_array->fill(addr, time, mask, byte_mask); } protected: From 40077df94f1afcfaabdc9599d7a2c25d3d98da8a Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 8 Mar 2021 17:58:02 -0500 Subject: [PATCH 049/154] solve deadlock for non-sectored cache configs --- src/gpgpu-sim/l2cache.cc | 47 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 63119ee90..00b14d7f6 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -717,6 +717,52 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { if (mf->get_data_size() == SECTOR_SIZE && mf->get_access_sector_mask().count() == 1) { result.push_back(mf); + } else if (mf->get_data_size() == 128) { + // break down every sector + mem_access_byte_mask_t mask; + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + for (unsigned 
k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); + } + const mem_access_t *ma = new mem_access_t( + mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, SECTOR_SIZE, + mf->is_write(), mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask, + std::bitset().set(i), m_gpu->gpgpu_ctx); + + mem_fetch *n_mf = + new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + + result.push_back(n_mf); + } + } else if (mf->get_data_size() == 64 && + (mf->get_access_sector_mask().to_string() == "1111" || + mf->get_access_sector_mask().to_string() == "0000")) { + unsigned start; + if (mf->get_addr() % 128 == 0) + start = 0; + else + start = 2; + mem_access_byte_mask_t mask; + for (unsigned i = start; i < start + 2; i++) { + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + mask.set(k); + } + const mem_access_t *ma = new mem_access_t( + mf->get_access_type(), mf->get_addr(), SECTOR_SIZE, + mf->is_write(), mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask, + std::bitset().set(i), m_gpu->gpgpu_ctx); + + mem_fetch *n_mf = + new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + + result.push_back(n_mf); + } } else { for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { if (sector_mask.test(i)) { @@ -739,6 +785,7 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { } } } + if (result.size() == 0) assert(0 && "no mf sent"); return result; } From 64bf6fd7a44a32773389e900862bd9c0527a87e9 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Thu, 18 Mar 2021 13:31:41 -0400 Subject: [PATCH 050/154] dirty counter not resetting after kernel finish --- src/gpgpu-sim/gpu-cache.cc | 80 +++++++++++++++++++++++++++++++------- 1 file changed, 67 
insertions(+), 13 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 46813e742..5ac202cea 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -210,6 +210,7 @@ void tag_array::init(int core_id, int type_id) { m_core_id = core_id; m_type_id = type_id; is_used = false; + m_dirty = 0; } void tag_array::add_pending_line(mem_fetch *mf) { @@ -250,7 +251,22 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, unsigned long long valid_timestamp = (unsigned)-1; bool all_reserved = true; + unsigned count = 0; + if (m_config.m_wr_percent == (unsigned)25) { + for (unsigned i = 0; i < m_config.m_nset * m_config.m_assoc; i++) { + if (m_lines[i]->is_modified_line()) { + m_lines[i]->is_modified_line(); + count++; + } + } + if (count != m_dirty) { + printf("count = %u, m_dirty = %u",count,m_dirty); + fflush(stdout); + assert(0 && "m_dirty miss match"); + printf("count = %u, m_dirty = %u",count,m_dirty); + } + } // check for hit or pending hit for (unsigned way = 0; way < m_config.m_assoc; way++) { unsigned index = set_index * m_config.m_assoc + way; @@ -279,15 +295,17 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, } } if (!line->is_reserved_line()) { - all_reserved = false; - if (line->is_invalid_line()) { - invalid_line = index; - } else { - // valid line : keep track of most appropriate replacement candidate - if (!line->get_status(mask) == MODIFIED || - 100 * m_dirty/(m_config.m_nset * m_config.m_assoc) >= m_config.m_wr_percent) { - // don't evict write until dirty lines reach threshold - // make sure at least 1 candidate is assigned + if (!line->is_modified_line() || + 100 * m_dirty / (m_config.m_nset * m_config.m_assoc) >= + m_config.m_wr_percent) { + all_reserved = false; + if (line->is_invalid_line()) { + invalid_line = index; + } else { + // valid line : keep track of most appropriate replacement candidate + + // don't evict write until dirty lines reach 
threshold + // make sure at least 1 candidate is assigned if (m_config.m_replacement_policy == LRU) { if (line->get_last_access_time() < valid_timestamp) { valid_timestamp = line->get_last_access_time(); @@ -363,6 +381,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_lines[idx]->get_modified_size(), m_lines[idx]->get_byte_mask(), m_lines[idx]->get_sector_mask()); + m_dirty--; } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mf->get_access_sector_mask()); @@ -373,8 +392,12 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_sector_miss++; shader_cache_access_log(m_core_id, m_type_id, 1); // log cache misses if (m_config.m_alloc_policy == ON_MISS) { + bool before = m_lines[idx]->is_modified_line(); ((sector_cache_block *)m_lines[idx]) ->allocate_sector(time, mf->get_access_sector_mask()); + if (before && !m_lines[idx]->is_modified_line()) { + m_dirty--; + } } break; case RESERVATION_FAIL: @@ -400,22 +423,35 @@ void tag_array::fill(new_addr_type addr, unsigned time, // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; enum cache_request_status status = probe(addr, idx, mask); + bool before = false; // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request - if (status == MISS) + if (status == MISS) { + before = m_lines[idx]->is_modified_line(); m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mask); - else if (status == SECTOR_MISS) { + } else if (status == SECTOR_MISS) { assert(m_config.m_cache_type == SECTOR); + before = m_lines[idx]->is_modified_line(); ((sector_cache_block *)m_lines[idx])->allocate_sector(time, mask); } - + if (before && !m_lines[idx]->is_modified_line()) { + m_dirty--; + } + before = m_lines[idx]->is_modified_line(); m_lines[idx]->fill(time, mask, byte_mask); + if (m_lines[idx]->is_modified_line() && !before) { + m_dirty++; + } } void tag_array::fill(unsigned index, 
unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); + bool before = m_lines[index]->is_modified_line(); m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); + if (m_lines[index]->is_modified_line() && !before) { + m_dirty++; + } } // TODO: we need write back the flushed data to the upper level @@ -424,9 +460,9 @@ void tag_array::flush() { for (unsigned i = 0; i < m_config.get_num_lines(); i++) if (m_lines[i]->is_modified_line()) { + m_dirty--; for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); - m_dirty--; } } @@ -1078,6 +1114,9 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { if (has_atomic) { assert(m_config.m_alloc_policy == ON_MISS); cache_block_t *block = m_tag_array->get_block(e->second.m_cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); // mark line as dirty for // atomic operation @@ -1187,6 +1226,9 @@ cache_request_status data_cache::wr_hit_wb(new_addr_type addr, new_addr_type block_addr = m_config.block_addr(addr); m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); @@ -1207,6 +1249,9 @@ cache_request_status data_cache::wr_hit_wt(new_addr_type addr, new_addr_type block_addr = m_config.block_addr(addr); m_tag_array->access(block_addr, time, cache_index, mf); // update LRU state cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); @@ -1358,6 +1403,9 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( 
m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); assert(status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); if (status == HIT_RESERVED) @@ -1479,6 +1527,9 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); assert(m_status != HIT); cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); if (m_status == HIT_RESERVED) { @@ -1546,6 +1597,9 @@ enum cache_request_status data_cache::rd_hit_base( if (mf->isatomic()) { assert(mf->get_access_type() == GLOBAL_ACC_R); cache_block_t *block = m_tag_array->get_block(cache_index); + if (!block->is_modified_line()) { + m_tag_array->inc_dirty(); + } block->set_status(MODIFIED, mf->get_access_sector_mask()); // mark line as block->set_byte_mask(mf); From a374b330ac3bec0b47ce588adf72af89e5cd9307 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Fri, 26 Mar 2021 16:33:41 -0400 Subject: [PATCH 051/154] remove MSHR_HIT from cache total access --- src/gpgpu-sim/gpu-cache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 5ac202cea..d2f9fef9c 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -819,7 +819,7 @@ void cache_stats::print_stats(FILE *fout, const char *cache_name) const { cache_request_status_str((enum cache_request_status)status), m_stats[type][status]); - if (status != RESERVATION_FAIL) + if (status != RESERVATION_FAIL && status != MSHR_HIT) total_access[type] += m_stats[type][status]; } } From f6fb56ba32141030803ecfe01b52a6f6c93d8e6c Mon Sep 17 00:00:00 
2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 6 Apr 2021 15:46:03 -0400 Subject: [PATCH 052/154] check sector readable only on reads --- src/gpgpu-sim/gpu-cache.cc | 27 ++++++++++++++------------- src/gpgpu-sim/gpu-cache.h | 10 ++++++---- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index d2f9fef9c..9c65476b1 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -232,15 +232,15 @@ void tag_array::remove_pending_line(mem_fetch *mf) { } enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, - mem_fetch *mf, + mem_fetch *mf, bool is_write, bool probe_mode) const { mem_access_sector_mask_t mask = mf->get_access_sector_mask(); - return probe(addr, idx, mask, probe_mode, mf); + return probe(addr, idx, mask,is_write, probe_mode, mf); } enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, mem_access_sector_mask_t mask, - bool probe_mode, + bool is_write, bool probe_mode, mem_fetch *mf) const { // assert( m_config.m_write_policy == READ_ONLY ); unsigned set_index = m_config.set_index(addr); @@ -279,7 +279,7 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, idx = index; return HIT; } else if (line->get_status(mask) == MODIFIED) { - if (line->is_readable(mask)) { + if ((!is_write && line->is_readable(mask)) || is_write) { idx = index; return HIT; } else { @@ -363,7 +363,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_access++; is_used = true; shader_cache_access_log(m_core_id, m_type_id, 0); // log accesses to cache - enum cache_request_status status = probe(addr, idx, mf); + enum cache_request_status status = probe(addr, idx, mf, mf->is_write()); switch (status) { case HIT_RESERVED: m_pending_hit++; @@ -414,16 +414,17 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, return status; } -void 
tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf) { - fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); +void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write) { + fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask(), is_write); } void tag_array::fill(new_addr_type addr, unsigned time, - mem_access_sector_mask_t mask, mem_access_byte_mask_t byte_mask) { + mem_access_sector_mask_t mask, mem_access_byte_mask_t byte_mask, + bool is_write) { // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; - enum cache_request_status status = probe(addr, idx, mask); - bool before = false; + enum cache_request_status status = probe(addr, idx, mask,is_write); + bool before = m_lines[idx]->is_modified_line(); // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request if (status == MISS) { @@ -1105,7 +1106,7 @@ void baseline_cache::fill(mem_fetch *mf, unsigned time) { if (m_config.m_alloc_policy == ON_MISS) m_tag_array->fill(e->second.m_cache_index, time, mf); else if (m_config.m_alloc_policy == ON_FILL) { - m_tag_array->fill(e->second.m_block_addr, time, mf); + m_tag_array->fill(e->second.m_block_addr, time, mf, mf->is_write()); if (m_config.is_streaming()) m_tag_array->remove_pending_line(mf); } else abort(); @@ -1659,7 +1660,7 @@ enum cache_request_status read_only_cache::access( new_addr_type block_addr = m_config.block_addr(addr); unsigned cache_index = (unsigned)-1; enum cache_request_status status = - m_tag_array->probe(block_addr, cache_index, mf); + m_tag_array->probe(block_addr, cache_index, mf, mf->is_write()); enum cache_request_status cache_status = RESERVATION_FAIL; if (status == HIT) { @@ -1746,7 +1747,7 @@ enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf, new_addr_type block_addr = m_config.block_addr(addr); unsigned cache_index = (unsigned)-1; enum cache_request_status probe_status = - 
m_tag_array->probe(block_addr, cache_index, mf, true); + m_tag_array->probe(block_addr, cache_index, mf, mf->is_write(), true); enum cache_request_status access_status = process_tag_probe(wr, probe_status, addr, cache_index, mf, time, events); m_stats.inc_stats(mf->get_access_type(), diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index a84ddd18a..c2e302ead 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -914,9 +914,11 @@ class tag_array { ~tag_array(); enum cache_request_status probe(new_addr_type addr, unsigned &idx, - mem_fetch *mf, bool probe_mode = false) const; + mem_fetch *mf, bool is_write, + bool probe_mode = false) const; enum cache_request_status probe(new_addr_type addr, unsigned &idx, mem_access_sector_mask_t mask, + bool is_write, bool probe_mode = false, mem_fetch *mf = NULL) const; enum cache_request_status access(new_addr_type addr, unsigned time, @@ -925,10 +927,10 @@ class tag_array { unsigned &idx, bool &wb, evicted_block_info &evicted, mem_fetch *mf); - void fill(new_addr_type addr, unsigned time, mem_fetch *mf); + void fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write); void fill(unsigned idx, unsigned time, mem_fetch *mf); void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask, - mem_access_byte_mask_t byte_mask); + mem_access_byte_mask_t byte_mask, bool is_write); unsigned size() const { return m_config.get_num_lines(); } cache_block_t *get_block(unsigned idx) { return m_lines[idx]; } @@ -1316,7 +1318,7 @@ class baseline_cache : public cache_t { void force_tag_access(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask) { mem_access_byte_mask_t byte_mask; - m_tag_array->fill(addr, time, mask, byte_mask); + m_tag_array->fill(addr, time, mask, byte_mask, true); } protected: From 994fb19e160e3897b5662fb7e6946a3802fde794 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 4 May 2021 15:17:57 -0400 Subject: [PATCH 
053/154] reset dirty counter --- src/gpgpu-sim/gpu-cache.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 9c65476b1..e88a64627 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -428,12 +428,10 @@ void tag_array::fill(new_addr_type addr, unsigned time, // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request if (status == MISS) { - before = m_lines[idx]->is_modified_line(); m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), time, mask); } else if (status == SECTOR_MISS) { assert(m_config.m_cache_type == SECTOR); - before = m_lines[idx]->is_modified_line(); ((sector_cache_block *)m_lines[idx])->allocate_sector(time, mask); } if (before && !m_lines[idx]->is_modified_line()) { @@ -458,10 +456,10 @@ void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { // TODO: we need write back the flushed data to the upper level void tag_array::flush() { if (!is_used) return; + m_dirty = 0; for (unsigned i = 0; i < m_config.get_num_lines(); i++) if (m_lines[i]->is_modified_line()) { - m_dirty--; for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); } @@ -472,6 +470,7 @@ void tag_array::flush() { void tag_array::invalidate() { if (!is_used) return; + m_dirty = 0; for (unsigned i = 0; i < m_config.get_num_lines(); i++) for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) From 73069303b3dc0845e33b9ddafa7e6697fe3deb38 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 11 May 2021 22:45:44 -0400 Subject: [PATCH 054/154] remove runtime check of dirty counter --- src/gpgpu-sim/gpu-cache.cc | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index e88a64627..9e1db8bc0 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ 
b/src/gpgpu-sim/gpu-cache.cc @@ -251,22 +251,6 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, unsigned long long valid_timestamp = (unsigned)-1; bool all_reserved = true; - unsigned count = 0; - if (m_config.m_wr_percent == (unsigned)25) { - for (unsigned i = 0; i < m_config.m_nset * m_config.m_assoc; i++) { - if (m_lines[i]->is_modified_line()) { - m_lines[i]->is_modified_line(); - count++; - } - } - if (count != m_dirty) { - printf("count = %u, m_dirty = %u",count,m_dirty); - fflush(stdout); - assert(0 && "m_dirty miss match"); - printf("count = %u, m_dirty = %u",count,m_dirty); - - } - } // check for hit or pending hit for (unsigned way = 0; way < m_config.m_assoc; way++) { unsigned index = set_index * m_config.m_assoc + way; From 0601354a4d7f7f106e008b47cbc74097ec0a2a69 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 18 May 2021 14:35:04 -0400 Subject: [PATCH 055/154] Add WT to lazy_fetch_on_read --- src/gpgpu-sim/gpu-cache.cc | 29 ++++++++++++++++++++++++++--- src/gpgpu-sim/gpu-cache.h | 3 +++ src/gpgpu-sim/shader.cc | 5 +++-- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 9e1db8bc0..390bacce2 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1494,16 +1494,39 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time, std::list &events, enum cache_request_status status) { new_addr_type block_addr = m_config.block_addr(addr); + new_addr_type mshr_addr = m_config.mshr_addr(mf->get_addr()); // if the request writes to the whole cache line/sector, then, write and set // cache line Modified. 
and no need to send read request to memory or reserve // mshr - if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); - return RESERVATION_FAIL; // cannot handle request this cycle + // Write allocate, maximum 2 requests (write miss, write back request) + // Conservatively ensure the worst-case request can be handled this + // cycle + if (m_config.m_write_policy == WRITE_THROUGH) { + bool mshr_hit = m_mshrs.probe(mshr_addr); + bool mshr_avail = !m_mshrs.full(mshr_addr); + if (miss_queue_full(1) || + (!(mshr_hit && mshr_avail) && + !(!mshr_hit && mshr_avail && + (m_miss_queue.size() < m_config.m_miss_queue_size)))) { + // check what is the exactly the failure reason + if (miss_queue_full(1)) + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + else if (mshr_hit && !mshr_avail) + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL); + else if (!mshr_hit && !mshr_avail) + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL); + else + assert(0); + + return RESERVATION_FAIL; + } + + send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); } + bool wb = false; evicted_block_info evicted; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index c2e302ead..6811b868e 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -821,6 +821,9 @@ class cache_config { write_allocate_policy_t get_write_allocate_policy() { return m_write_alloc_policy; } + write_policy_t get_write_policy() { + return m_write_policy; + } protected: void exit_parse_error() { diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 4b4c98db7..22bd8e9a9 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1989,9 +1989,10 @@ void ldst_unit::L1_latency_queue_cycle() { } else { assert(status == MISS || status == HIT_RESERVED); l1_latency_queue[j][0] = NULL; - if (mf_next->get_inst().is_store() && + if (m_config->m_L1D_config.get_write_policy() != 
WRITE_THROUGH && + mf_next->get_inst().is_store() && (m_config->m_L1D_config.get_write_allocate_policy() == FETCH_ON_WRITE || - m_config->m_L1D_config.get_write_allocate_policy() == LAZY_FETCH_ON_READ) && + m_config->m_L1D_config.get_write_allocate_policy() == LAZY_FETCH_ON_READ) && !was_writeallocate_sent(events)) { unsigned dec_ack = (m_config->m_L1D_config.get_mshr_type() == SECTOR_ASSOC) From f7833519471ce92619bd1e4807ec07eb55aed76e Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 17 May 2021 17:35:06 -0400 Subject: [PATCH 056/154] new configs - adaptive cache and cache write ratio --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 ++ configs/tested-cfgs/SM7_QV100/gpgpusim.config | 6 ++++++ configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 3 +++ src/abstract_hardware_model.h | 2 ++ src/gpgpu-sim/gpu-cache.h | 5 +++++ src/gpgpu-sim/gpu-sim.cc | 7 +++++++ 6 files changed, 25 insertions(+) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 6189dca0f..e006085df 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -110,6 +110,8 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 +-gpgpu_shmem_option 0,8,16,32,64,100 +-gpgpu_unified_l1d_size 128 # 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). 
This gives us 3MB L2 cache -gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index bc5677cf3..043fce64c 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -124,6 +124,9 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 +-gpgpu_cache_write_ratio 25 +-gpgpu_shmem_option 0,12,24,48,96 +-gpgpu_unified_l1d_size 128 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 6MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 @@ -203,3 +206,6 @@ #-trace_components WARP_SCHEDULER,SCOREBOARD #-trace_sampling_core 0 +-gpgpu_cache_write_ratio 25 +-gpgpu_shmem_option 0,12,24,48,96 +-gpgpu_unified_l1d_size 128 \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 3fa51ee14..1f0c15f51 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -125,6 +125,9 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 +-gpgpu_cache_write_ratio 25 +-gpgpu_shmem_option 0,12,24,48,96 +-gpgpu_unified_l1d_size 128 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 4.5MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 982e41606..e796571dc 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -373,6 +373,8 @@ class core_config { } unsigned mem_warp_parts; mutable unsigned gpgpu_shmem_size; + char *gpgpu_shmem_option; + unsigned gpgpu_unified_l1d_size; unsigned gpgpu_shmem_sizeDefault; unsigned gpgpu_shmem_sizePrefL1; unsigned gpgpu_shmem_sizePrefShared; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 00c09ae55..ccc935bae 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -491,6 +491,7 @@ class cache_config { m_data_port_width = 0; m_set_index_function = LINEAR_SET_FUNCTION; m_is_streaming = false; + m_wr_percent = 0; } void init(char *config, FuncCache status) { cache_status = status; @@ -754,6 +755,10 @@ class cache_config { char *m_config_stringPrefL1; char *m_config_stringPrefShared; FuncCache cache_status; + unsigned m_wr_percent; + write_allocate_policy_t get_write_allocate_policy() { + return m_write_alloc_policy; + } protected: void exit_parse_error() { diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index fd36e006a..bd09cdbe5 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -249,6 +249,7 @@ void shader_core_config::reg_options(class OptionParser *opp) { " {::,:::,::, | none}", "none"); + option_parser_register(opp,"-gpgpu_cache_write_ratio",OPT_UINT32,&m_L1D_config.m_wr_percent,"L1D write ratio","0"); option_parser_register(opp, "-gpgpu_l1_banks", OPT_UINT32, &m_L1D_config.l1_banks, "The number of L1 cache banks", "1"); @@ -326,6 +327,12 @@ void shader_core_config::reg_options(class OptionParser *opp) { option_parser_register( opp, "-gpgpu_shmem_size", OPT_UINT32, &gpgpu_shmem_size, "Size of shared memory per shader core (default 16kB)", "16384"); + option_parser_register( + opp, 
"-gpgpu_shmem_option", OPT_CSTR, &gpgpu_shmem_option, + "Option list of shared memory sizes", "0"); + option_parser_register( + opp, "-gpgpu_unified_l1d_size", OPT_UINT32, &gpgpu_unified_l1d_size, + "Size of unified data cache(L1D + shared memory) in KB", "0"); option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_BOOL, &adaptive_cache_config, "adaptive_cache_config", "0"); option_parser_register( From a2b1b1c2839fe3fc05a0cae126204120fab00f62 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 17 May 2021 17:35:53 -0400 Subject: [PATCH 057/154] adaptive cache - update --- src/abstract_hardware_model.h | 2 +- src/gpgpu-sim/gpu-cache.h | 11 ++++ src/gpgpu-sim/shader.cc | 95 +++++++++++++++++++++-------------- 3 files changed, 68 insertions(+), 40 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index e796571dc..bd10a93fe 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -65,7 +65,7 @@ enum FuncCache { FuncCachePreferL1 = 2 }; -enum AdaptiveCache { FIXED = 0, ADAPTIVE_VOLTA = 1 }; +enum AdaptiveCache { FIXED = 0, ADAPTIVE_CACHE = 1 }; #ifdef __cplusplus diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index ccc935bae..0162b6cbc 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -616,6 +616,8 @@ class cache_config { m_atom_sz = (m_cache_type == SECTOR) ? SECTOR_SIZE : m_line_sz; m_sector_sz_log2 = LOGB2(SECTOR_SIZE); original_m_assoc = m_assoc; + original_sz = m_nset * original_m_assoc * m_line_sz; + // For more details about difference between FETCH_ON_WRITE and WRITE // VALIDAE policies Read: Jouppi, Norman P. 
"Cache write policies and @@ -710,6 +712,14 @@ class cache_config { assert(m_valid); return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER * original_m_assoc; } + unsigned get_original_assoc() const { + assert(m_valid); + return original_m_assoc; + } + unsigned get_original_sz() const { + assert(m_valid); + return original_sz; + } void print(FILE *fp) const { fprintf(fp, "Size = %d B (%d Set x %d-way x %d byte line)\n", m_line_sz * m_nset * m_assoc, m_nset, m_assoc, m_line_sz); @@ -777,6 +787,7 @@ class cache_config { unsigned m_atom_sz; unsigned m_sector_sz_log2; unsigned original_m_assoc; + unsigned original_sz; bool m_is_streaming; enum replacement_policy_t m_replacement_policy; // 'L' = LRU, 'F' = FIFO diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 14d904424..b2adb4f53 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3292,50 +3292,67 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { if (adaptive_cache_config && !k.cache_config_set) { // For more info about adaptive cache, see // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x - unsigned total_shmed = kernel_info->smem * result; - assert(total_shmed >= 0 && total_shmed <= gpgpu_shmem_size); - // assert(gpgpu_shmem_size == 98304); //Volta has 96 KB shared - // assert(m_L1D_config.get_nset() == 4); //Volta L1 has four sets - if (total_shmed < gpgpu_shmem_size) { - switch (adaptive_cache_config) { - case FIXED: - break; - case ADAPTIVE_VOLTA: { - // For Volta, we assign the remaining shared memory to L1 cache - // For more info about adaptive cache, see - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x - // assert(gpgpu_shmem_size == 98304); //Volta has 96 KB shared - - // To Do: make it flexible and not tuned to 9KB share memory - unsigned max_assoc = m_L1D_config.get_max_assoc(); - if (total_shmed == 0) - m_L1D_config.set_assoc(max_assoc); // L1 is 128KB and shd=0 - else if 
(total_shmed > 0 && total_shmed <= 8192) - m_L1D_config.set_assoc(0.9375 * - max_assoc); // L1 is 120KB and shd=8KB - else if (total_shmed > 8192 && total_shmed <= 16384) - m_L1D_config.set_assoc(0.875 * - max_assoc); // L1 is 112KB and shd=16KB - else if (total_shmed > 16384 && total_shmed <= 32768) - m_L1D_config.set_assoc(0.75 * max_assoc); // L1 is 96KB and - // shd=32KB - else if (total_shmed > 32768 && total_shmed <= 65536) - m_L1D_config.set_assoc(0.5 * max_assoc); // L1 is 64KB and shd=64KB - else if (total_shmed > 65536 && total_shmed <= gpgpu_shmem_size) - m_L1D_config.set_assoc(0.25 * max_assoc); // L1 is 32KB and - // shd=96KB - else - assert(0); - break; + std::vector shmem_list; + for (unsigned i = 0; i < strlen(gpgpu_shmem_option); i++) { + char option[4]; + int j = 0; + while (gpgpu_shmem_option[i] != ',' && i < strlen(gpgpu_shmem_option)) { + if (gpgpu_shmem_option[i] == ' ') { + // skip spaces + i++; + } else { + if (!isdigit(gpgpu_shmem_option[i])) { + // check for non digits, which should not be here + assert(0 && "invalid config: -gpgpu_shmem_option"); + } + option[j] = gpgpu_shmem_option[i]; + j++; + i++; } - default: - assert(0); } + // convert KB -> B + shmem_list.push_back((unsigned)atoi(option) * 1024); + } - printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", - m_L1D_config.get_total_size_inKB()); + unsigned total_shmem = kernel_info->smem * result; + unsigned total_unified = gpgpu_unified_l1d_size * 1024; + std::sort(shmem_list.begin(), shmem_list.end()); + + assert(total_shmem >= 0 && total_shmem <= shmem_list.back()); + switch (adaptive_cache_config) { + case FIXED: + break; + case ADAPTIVE_CACHE: { + // For more info about adaptive cache, see + bool l1d_configured = false; + unsigned l1_defined = m_L1D_config.get_original_sz() / 1024; + unsigned max_assoc = m_L1D_config.get_original_assoc() * + gpgpu_unified_l1d_size / l1_defined; + + if (total_shmem == 0) { + m_L1D_config.set_assoc(max_assoc); + l1d_configured = true; + } else { + 
for (std::vector::iterator it = shmem_list.begin(); + it < shmem_list.end() - 1; it++) { + if (total_shmem > *it && total_shmem <= *(it + 1)) { + float l1_ratio = 1 - (float) *(it + 1) / total_unified; + m_L1D_config.set_assoc(max_assoc * l1_ratio); + l1d_configured = true; + break; + } + } + } + assert(l1d_configured && "no shared memory option found"); + break; + } + default: + assert(0); } + printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", + m_L1D_config.get_total_size_inKB()); + k.cache_config_set = true; } From f70f5d6e5599c643074b0d00d3e3dcc385e5913d Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Wed, 19 May 2021 15:10:51 -0400 Subject: [PATCH 058/154] re-wording/formatting --- src/gpgpu-sim/gpu-cache.cc | 17 ++++++++--------- src/gpgpu-sim/gpu-cache.h | 6 +++--- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 390bacce2..05b338ea6 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -280,7 +280,7 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, } if (!line->is_reserved_line()) { if (!line->is_modified_line() || - 100 * m_dirty / (m_config.m_nset * m_config.m_assoc) >= + m_dirty / (m_config.m_nset * m_config.m_assoc * 100) >= m_config.m_wr_percent) { all_reserved = false; if (line->is_invalid_line()) { @@ -364,7 +364,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, evicted.set_info(m_lines[idx]->m_block_addr, m_lines[idx]->get_modified_size(), m_lines[idx]->get_byte_mask(), - m_lines[idx]->get_sector_mask()); + m_lines[idx]->get_dirty_sector_mask()); m_dirty--; } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), @@ -430,17 +430,13 @@ void tag_array::fill(new_addr_type addr, unsigned time, void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); - bool before = 
m_lines[index]->is_modified_line(); m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); - if (m_lines[index]->is_modified_line() && !before) { - m_dirty++; - } + m_dirty++; } // TODO: we need write back the flushed data to the upper level void tag_array::flush() { if (!is_used) return; - m_dirty = 0; for (unsigned i = 0; i < m_config.get_num_lines(); i++) if (m_lines[i]->is_modified_line()) { @@ -448,18 +444,19 @@ void tag_array::flush() { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); } } - + + m_dirty = 0; is_used = false; } void tag_array::invalidate() { if (!is_used) return; - m_dirty = 0; for (unsigned i = 0; i < m_config.get_num_lines(); i++) for (unsigned j = 0; j < SECTOR_CHUNCK_SIZE; j++) m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); + m_dirty = 0; is_used = false; } @@ -804,6 +801,8 @@ void cache_stats::print_stats(FILE *fout, const char *cache_name) const { m_stats[type][status]); if (status != RESERVATION_FAIL && status != MSHR_HIT) + // MSHR_HIT is a special type of SECTOR_MISS + // so its already included in the SECTOR_MISS total_access[type] += m_stats[type][status]; } } diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 6811b868e..51791735a 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -136,7 +136,7 @@ struct cache_block_t { virtual void set_byte_mask(mem_fetch *mf) = 0; virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) = 0; virtual mem_access_byte_mask_t get_byte_mask() = 0; - virtual mem_access_sector_mask_t get_sector_mask() = 0; + virtual mem_access_sector_mask_t get_dirty_sector_mask() = 0; virtual unsigned long long get_last_access_time() = 0; virtual void set_last_access_time(unsigned long long time, mem_access_sector_mask_t sector_mask) = 0; @@ -218,7 +218,7 @@ struct line_cache_block : public cache_block_t { virtual mem_access_byte_mask_t get_byte_mask() { return m_byte_mask; } - virtual 
mem_access_sector_mask_t get_sector_mask() { + virtual mem_access_sector_mask_t get_dirty_sector_mask() { mem_access_sector_mask_t sector_mask; if (m_status == MODIFIED) sector_mask.set(); return sector_mask; @@ -413,7 +413,7 @@ struct sector_cache_block : public cache_block_t { virtual mem_access_byte_mask_t get_byte_mask() { return m_byte_mask; } - virtual mem_access_sector_mask_t get_sector_mask() { + virtual mem_access_sector_mask_t get_dirty_sector_mask() { mem_access_sector_mask_t sector_mask; for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { if (m_status[i] == MODIFIED) From 4a762a933a054b5124fa46a12789ea98f5e2411d Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Wed, 19 May 2021 15:22:31 -0400 Subject: [PATCH 059/154] formatting again --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 4 ++-- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 8 ++------ configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- src/gpgpu-sim/gpu-sim.cc | 2 +- src/gpgpu-sim/shader.cc | 1 + 5 files changed, 7 insertions(+), 10 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index e006085df..d7573ab33 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -100,6 +100,8 @@ # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo -gpgpu_adaptive_cache_config 0 +-gpgpu_shmem_option 0,8,16,32,64,100 +-gpgpu_unified_l1d_size 128 -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:512,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_shmem_size 65536 @@ -110,8 +112,6 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 --gpgpu_shmem_option 0,8,16,32,64,100 --gpgpu_unified_l1d_size 128 # 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). 
This gives us 3MB L2 cache -gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 043fce64c..59c7f43f7 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -124,7 +124,7 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 --gpgpu_cache_write_ratio 25 +-gpgpu_l1_cache_write_ratio 25 -gpgpu_shmem_option 0,12,24,48,96 -gpgpu_unified_l1d_size 128 @@ -204,8 +204,4 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 - --gpgpu_cache_write_ratio 25 --gpgpu_shmem_option 0,12,24,48,96 --gpgpu_unified_l1d_size 128 \ No newline at end of file +#-trace_sampling_core 0 \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 1f0c15f51..3e080bcc5 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -125,7 +125,7 @@ -gpgpu_l1_latency 20 -gpgpu_smem_latency 20 -gpgpu_flush_l1_cache 1 --gpgpu_cache_write_ratio 25 +-gpgpu_l1_cache_write_ratio 25 -gpgpu_shmem_option 0,12,24,48,96 -gpgpu_unified_l1d_size 128 diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index bd09cdbe5..a2aa9293f 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -249,7 +249,7 @@ void shader_core_config::reg_options(class OptionParser *opp) { " {::,:::,::, | none}", "none"); - option_parser_register(opp,"-gpgpu_cache_write_ratio",OPT_UINT32,&m_L1D_config.m_wr_percent,"L1D write ratio","0"); + option_parser_register(opp,"-gpgpu_l1_cache_write_ratio",OPT_UINT32,&m_L1D_config.m_wr_percent,"L1D write ratio","0"); option_parser_register(opp, "-gpgpu_l1_banks", OPT_UINT32, &m_L1D_config.l1_banks, "The number of L1 cache banks", "1"); diff --git a/src/gpgpu-sim/shader.cc 
b/src/gpgpu-sim/shader.cc index b2adb4f53..141c700db 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3326,6 +3326,7 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { // For more info about adaptive cache, see bool l1d_configured = false; unsigned l1_defined = m_L1D_config.get_original_sz() / 1024; + assert(gpgpu_unified_l1d_size % l1_defined == 0); unsigned max_assoc = m_L1D_config.get_original_assoc() * gpgpu_unified_l1d_size / l1_defined; From 4c354ebda2c92bb5866c20f03a254743c8ec85a3 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Wed, 19 May 2021 15:45:35 -0400 Subject: [PATCH 060/154] minor improvements --- src/gpgpu-sim/gpu-cache.cc | 14 ++++++++------ src/gpgpu-sim/gpu-cache.h | 6 +++--- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 05b338ea6..98951cabb 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -279,17 +279,19 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, } } if (!line->is_reserved_line()) { + // percentage of dirty lines in the cache + // number of dirty lines / total lines in the cache + float dirty_line_percentage = + (float) (m_dirty / (m_config.m_nset * m_config.m_assoc )) * 100; if (!line->is_modified_line() || - m_dirty / (m_config.m_nset * m_config.m_assoc * 100) >= - m_config.m_wr_percent) { + dirty_line_percentage >= m_config.m_wr_percent) { + // if number of dirty lines in the cache is greater than + // a specific value all_reserved = false; if (line->is_invalid_line()) { invalid_line = index; } else { // valid line : keep track of most appropriate replacement candidate - - // don't evict write until dirty lines reach threshold - // make sure at least 1 candidate is assigned if (m_config.m_replacement_policy == LRU) { if (line->get_last_access_time() < valid_timestamp) { valid_timestamp = line->get_last_access_time(); 
@@ -363,7 +365,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, m_lines[idx]->set_byte_mask(mf); evicted.set_info(m_lines[idx]->m_block_addr, m_lines[idx]->get_modified_size(), - m_lines[idx]->get_byte_mask(), + m_lines[idx]->get_dirty_byte_mask(), m_lines[idx]->get_dirty_sector_mask()); m_dirty--; } diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 51791735a..dc3b39a50 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -135,7 +135,7 @@ struct cache_block_t { mem_access_sector_mask_t sector_mask) = 0; virtual void set_byte_mask(mem_fetch *mf) = 0; virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) = 0; - virtual mem_access_byte_mask_t get_byte_mask() = 0; + virtual mem_access_byte_mask_t get_dirty_byte_mask() = 0; virtual mem_access_sector_mask_t get_dirty_sector_mask() = 0; virtual unsigned long long get_last_access_time() = 0; virtual void set_last_access_time(unsigned long long time, @@ -215,7 +215,7 @@ struct line_cache_block : public cache_block_t { virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { m_byte_mask = m_byte_mask | byte_mask; } - virtual mem_access_byte_mask_t get_byte_mask() { + virtual mem_access_byte_mask_t get_dirty_byte_mask() { return m_byte_mask; } virtual mem_access_sector_mask_t get_dirty_sector_mask() { @@ -410,7 +410,7 @@ struct sector_cache_block : public cache_block_t { virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { m_byte_mask = m_byte_mask | byte_mask; } - virtual mem_access_byte_mask_t get_byte_mask() { + virtual mem_access_byte_mask_t get_dirty_byte_mask() { return m_byte_mask; } virtual mem_access_sector_mask_t get_dirty_sector_mask() { From f27da224f3e468d600499a9d3619009ed9c70256 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Wed, 19 May 2021 17:27:43 -0400 Subject: [PATCH 061/154] Use cache config multipilier when possible --- src/abstract_hardware_model.h | 1 - 
src/gpgpu-sim/gpu-cache.h | 28 +++++++++++++++------------- src/gpgpu-sim/gpu-sim.cc | 2 +- src/gpgpu-sim/shader.cc | 8 +++----- 4 files changed, 19 insertions(+), 20 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index bd10a93fe..dbe138a66 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -374,7 +374,6 @@ class core_config { unsigned mem_warp_parts; mutable unsigned gpgpu_shmem_size; char *gpgpu_shmem_option; - unsigned gpgpu_unified_l1d_size; unsigned gpgpu_shmem_sizeDefault; unsigned gpgpu_shmem_sizePrefL1; unsigned gpgpu_shmem_sizePrefShared; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 0162b6cbc..87a6b13e7 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -512,6 +512,14 @@ class cache_config { exit_parse_error(); } + // set * assoc * cacheline size. Then convert Byte to KB + unsigned original_size = m_nset * m_assoc * m_line_sz / 1024; + if (m_unified_cache_size > 0) { + max_cache_multiplier = m_unified_cache_size / original_size; + } else { + max_cache_multiplier = MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + } + switch (ct) { case 'N': m_cache_type = NORMAL; @@ -588,7 +596,7 @@ class cache_config { // https://ieeexplore.ieee.org/document/8344474/ m_is_streaming = true; m_alloc_policy = ON_FILL; - m_mshr_entries = m_nset * m_assoc * MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + m_mshr_entries = m_nset * m_assoc * max_cache_multiplier; if (m_cache_type == SECTOR) m_mshr_entries *= SECTOR_CHUNCK_SIZE; m_mshr_max_merge = MAX_WARP_PER_SM; } @@ -616,7 +624,6 @@ class cache_config { m_atom_sz = (m_cache_type == SECTOR) ? 
SECTOR_SIZE : m_line_sz; m_sector_sz_log2 = LOGB2(SECTOR_SIZE); original_m_assoc = m_assoc; - original_sz = m_nset * original_m_assoc * m_line_sz; // For more details about difference between FETCH_ON_WRITE and WRITE @@ -706,19 +713,13 @@ class cache_config { } unsigned get_max_num_lines() const { assert(m_valid); - return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER * m_nset * original_m_assoc; + // gpgpu_unified_cache_size is in KB while original_sz is in B + return max_cache_multiplier * m_nset * original_m_assoc; } unsigned get_max_assoc() const { assert(m_valid); - return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER * original_m_assoc; - } - unsigned get_original_assoc() const { - assert(m_valid); - return original_m_assoc; - } - unsigned get_original_sz() const { - assert(m_valid); - return original_sz; + // gpgpu_unified_cache_size is in KB while original_sz is in B + return max_cache_multiplier * original_m_assoc; } void print(FILE *fp) const { fprintf(fp, "Size = %d B (%d Set x %d-way x %d byte line)\n", @@ -766,6 +767,7 @@ class cache_config { char *m_config_stringPrefShared; FuncCache cache_status; unsigned m_wr_percent; + unsigned m_unified_cache_size; write_allocate_policy_t get_write_allocate_policy() { return m_write_alloc_policy; } @@ -787,8 +789,8 @@ class cache_config { unsigned m_atom_sz; unsigned m_sector_sz_log2; unsigned original_m_assoc; - unsigned original_sz; bool m_is_streaming; + unsigned max_cache_multiplier; enum replacement_policy_t m_replacement_policy; // 'L' = LRU, 'F' = FIFO enum write_policy_t diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index a2aa9293f..df3004772 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -331,7 +331,7 @@ void shader_core_config::reg_options(class OptionParser *opp) { opp, "-gpgpu_shmem_option", OPT_CSTR, &gpgpu_shmem_option, "Option list of shared memory sizes", "0"); option_parser_register( - opp, "-gpgpu_unified_l1d_size", OPT_UINT32, &gpgpu_unified_l1d_size, + opp, 
"-gpgpu_unified_l1d_size", OPT_UINT32, &m_L1D_config.m_unified_cache_size, "Size of unified data cache(L1D + shared memory) in KB", "0"); option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_BOOL, &adaptive_cache_config, "adaptive_cache_config", "0"); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 141c700db..3efef2b34 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3315,7 +3315,8 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { } unsigned total_shmem = kernel_info->smem * result; - unsigned total_unified = gpgpu_unified_l1d_size * 1024; + // Unified cache config is in KB. Converting to B + unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; std::sort(shmem_list.begin(), shmem_list.end()); assert(total_shmem >= 0 && total_shmem <= shmem_list.back()); @@ -3325,10 +3326,7 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { case ADAPTIVE_CACHE: { // For more info about adaptive cache, see bool l1d_configured = false; - unsigned l1_defined = m_L1D_config.get_original_sz() / 1024; - assert(gpgpu_unified_l1d_size % l1_defined == 0); - unsigned max_assoc = m_L1D_config.get_original_assoc() * - gpgpu_unified_l1d_size / l1_defined; + unsigned max_assoc = m_L1D_config.get_max_assoc(); if (total_shmem == 0) { m_L1D_config.set_assoc(max_assoc); From 14f22bcdd171cdeb8d8f56f9ed02d6f711189be8 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 17:56:14 -0400 Subject: [PATCH 062/154] add checking on spec unit in subcore --- src/gpgpu-sim/shader.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 6229d1625..2513dde11 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -108,7 +108,7 @@ void shader_core_ctx::create_front_pipeline() { if (m_config->sub_core_model) { // in subcore model, each scheduler should has its own issue register, so - // num scheduler = reg width + // ensure num scheduler = reg width assert(m_config->gpgpu_num_sched_per_core == m_pipeline_reg[ID_OC_SP].get_size()); assert(m_config->gpgpu_num_sched_per_core == @@ -124,6 +124,11 @@ void shader_core_ctx::create_front_pipeline() { if (m_config->gpgpu_num_int_units > 0) assert(m_config->gpgpu_num_sched_per_core == m_pipeline_reg[ID_OC_INT].get_size()); + for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + if (m_config->m_specialized_unit[j].num_units > 0) + assert(m_config->gpgpu_num_sched_per_core == + m_config->m_specialized_unit[j].id_oc_spec_reg_width); + } } m_threadState = (thread_ctx_t *)calloc(sizeof(thread_ctx_t), From 604baaf59255776b4714c0270ce36ad823d34df4 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 18:28:41 -0400 Subject: [PATCH 063/154] fixing the failing of merging --- src/gpgpu-sim/gpu-cache.h | 3 +-- src/gpgpu-sim/shader.cc | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 75dce40f4..d80152812 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -841,8 +841,8 @@ class cache_config { char *m_config_stringPrefL1; char *m_config_stringPrefShared; FuncCache cache_status; - unsigned m_wr_percent; unsigned m_unified_cache_size; + unsigned m_wr_percent; write_allocate_policy_t get_write_allocate_policy() { return m_write_alloc_policy; } @@ -897,7 +897,6 @@ class cache_config { unsigned m_data_port_width; //< number of byte the cache can access per cycle enum set_index_function m_set_index_function; // Hash, linear, or custom set index function - unsigned m_wr_percent; friend class tag_array; friend class baseline_cache; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index db53fca7b..75fbe1646 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3391,13 +3391,12 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { assert(0); } -<<<<<<< HEAD if(m_L1D_config.is_streaming()) { //for streaming cache, if the whole memory is allocated //to the L1 cache, then make the allocation to be on_MISS //otherwise, make it ON_FILL to eliminate line allocation fails //i.e. 
MSHR throughput is the same, independent on the L1 cache size/associativity - if(total_shmed == 0) { + if(total_shmem == 0) { m_L1D_config.set_allocation_policy(ON_MISS); printf("GPGPU-Sim: Reconfigure L1 allocation to ON_MISS\n"); } @@ -3406,10 +3405,8 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { printf("GPGPU-Sim: Reconfigure L1 allocation to ON_FILL\n"); } } -======= printf("GPGPU-Sim: Reconfigure L1 cache to %uKB\n", m_L1D_config.get_total_size_inKB()); ->>>>>>> 2b2b6a2916e4ed833c707be887bf927167a71fa6 k.cache_config_set = true; } From a2ba2f57e8a24b9dd6ec6f2568accbbf439a9dca Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Wed, 19 May 2021 19:39:48 -0400 Subject: [PATCH 064/154] updating config files with right adaptive cache parameters --- .../tested-cfgs/SM75_RTX2060/gpgpusim.config | 18 ++++++++++-------- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 19 ++++++++++--------- .../tested-cfgs/SM7_TITANV/gpgpusim.config | 17 +++++++++-------- .../tested-cfgs/SM86_RTX3070/gpgpusim.config | 11 +++++++---- 4 files changed, 36 insertions(+), 29 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index d7573ab33..9e50fa305 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -99,19 +99,21 @@ # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo --gpgpu_adaptive_cache_config 0 --gpgpu_shmem_option 0,8,16,32,64,100 --gpgpu_unified_l1d_size 128 +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 32,64 +-gpgpu_unified_l1d_size 96 +# L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:512,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_flush_l1_cache 1 +# shared memory configuration -gpgpu_shmem_size 65536 
-gpgpu_shmem_sizeDefault 65536 -gpgpu_shmem_per_block 65536 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 -gpgpu_smem_latency 20 --gpgpu_flush_l1_cache 1 # 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 3MB L2 cache -gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 59c7f43f7..3750de09f 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -94,7 +94,7 @@ -gpgpu_shmem_num_banks 32 -gpgpu_shmem_limited_broadcast 0 -gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 60 +-gpgpu_coalesce_arch 70 # Volta has four schedulers per core -gpgpu_num_sched_per_core 4 @@ -113,20 +113,21 @@ # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x # disable this mode in case of multi kernels/apps execution -gpgpu_adaptive_cache_config 1 -# Volta unified cache has four banks +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 -gpgpu_smem_latency 20 --gpgpu_flush_l1_cache 1 --gpgpu_l1_cache_write_ratio 25 --gpgpu_shmem_option 0,12,24,48,96 --gpgpu_unified_l1d_size 128 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 6MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 32245d78a..e7f73059a 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -114,20 +114,21 @@ # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x # disable this mode in case of multi kernels/apps execution -gpgpu_adaptive_cache_config 1 -# Volta unified cache has four banks +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_l1_latency 20 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration -gpgpu_shmem_size 98304 -gpgpu_shmem_sizeDefault 98304 -gpgpu_shmem_per_block 65536 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 -gpgpu_smem_latency 20 --gpgpu_flush_l1_cache 1 --gpgpu_l1_cache_write_ratio 25 --gpgpu_shmem_option 0,12,24,48,96 --gpgpu_unified_l1d_size 128 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). 
This gives us 4.5MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index f5418ad8e..3c0db06a8 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -107,17 +107,20 @@ # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-8-x # disable this mode in case of multi kernels/apps execution -gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,100 +-gpgpu_unified_l1d_size 128 # Ampere unified cache has four banks -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_l1_latency 20 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_flush_l1_cache 1 +# shared memory configuration -gpgpu_shmem_size 102400 -gpgpu_shmem_sizeDefault 102400 -gpgpu_shmem_per_block 102400 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 20 -gpgpu_smem_latency 20 --gpgpu_flush_l1_cache 1 # 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 3MB L2 cache -gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 From b63d19a55c320b0bfd3ba4c80fe6f47a11bba39b Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 19:41:22 -0400 Subject: [PATCH 065/154] updating config files --- .../tested-cfgs/SM75_RTX2060/gpgpusim.config | 1 + .../tested-cfgs/SM86_RTX3070/gpgpusim.config | 1 + configs/tested-cfgs/TITAN_V/gpgpusim.config | 173 ++++++++++++++++++ configs/tested-cfgs/TITAN_V/trace.config | 18 ++ 4 files changed, 193 insertions(+) create mode 100644 configs/tested-cfgs/TITAN_V/gpgpusim.config create mode 100644 configs/tested-cfgs/TITAN_V/trace.config diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 9e50fa305..856f5cffd 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -105,6 +105,7 @@ # L1 cache configuration -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 -gpgpu_n_cluster_ejection_buffer_size 32 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 3c0db06a8..9123e206f 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -112,6 +112,7 @@ # Ampere unified cache has four banks -gpgpu_l1_banks 4 -gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 -gpgpu_gmem_skip_L1D 0 -gpgpu_l1_latency 20 -gpgpu_n_cluster_ejection_buffer_size 32 diff --git a/configs/tested-cfgs/TITAN_V/gpgpusim.config b/configs/tested-cfgs/TITAN_V/gpgpusim.config new file mode 100644 index 000000000..8b5cb202f --- /dev/null +++ b/configs/tested-cfgs/TITAN_V/gpgpusim.config @@ -0,0 +1,173 @@ +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 
+-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 6745 +-gpgpu_TB_launch_latency 0 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 24 +-gpgpu_n_sub_partition_per_mchannel 2 + +# clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1200:1200:1200:850 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,4,4,4,21 +-ptx_opcode_initiation_int 2,2,2,2,2 +-ptx_opcode_latency_fp 4,4,4,4,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,8,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 21 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 35 +-ptx_opcode_initiation_tensor 32 + +# sub core model: in which each scheduler has its own register file and EUs +# i.e. 
schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# register banks +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# warp scheduling +-gpgpu_num_sched_per_core 4 +-gpgpu_scheduler gto +# a warp scheduler issue mode +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache +-gpgpu_adaptive_cache_config 1 +-gpgpu_l1_banks 4 +-gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:64,16:0,32 +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 49152 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_latency 33 +-gpgpu_smem_latency 27 +-gpgpu_flush_l1_cache 1 + +# L2 cache +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 1 +-gpgpu_memory_partition_indexing 0 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected since Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 177 +-dram_latency 103 + +# dram sched config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# dram model config +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS + +# Mem timing +-gpgpu_dram_timing_opt nbk=16:CCD=1:RRD=4:RCD=12:RAS=29:RP=12:RC=40:CL=12:WL=2:CDLR=3:WR=11:nbkgrp=4:CCDL=2:RTPL=4 +-dram_dual_bus_interface 1 + +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# power model configs, disable it untill we create a real energy model +-power_simulation_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 + diff --git a/configs/tested-cfgs/TITAN_V/trace.config b/configs/tested-cfgs/TITAN_V/trace.config new file mode 100644 index 000000000..6e193f7bf --- /dev/null +++ b/configs/tested-cfgs/TITAN_V/trace.config @@ -0,0 +1,18 @@ +-trace_opcode_latency_initiation_int 4,2 +-trace_opcode_latency_initiation_sp 4,2 +-trace_opcode_latency_initiation_dp 8,4 +-trace_opcode_latency_initiation_sfu 21,8 
+-trace_opcode_latency_initiation_tensor 2,2 + +#execute branch insts on spec unit 1 +#,,,,, +-specialized_unit_1 1,4,4,4,4,BRA +-trace_opcode_latency_initiation_spec_op_1 4,4 + +#TEX unit, make fixed latency for all tex insts +-specialized_unit_2 1,4,200,4,4,TEX +-trace_opcode_latency_initiation_spec_op_2 200,4 + +#tensor unit +-specialized_unit_3 1,4,2,4,4,TENSOR +-trace_opcode_latency_initiation_spec_op_3 2,2 From e3d186bbeade78dec776989ccec2a0c0aea27fb4 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Wed, 19 May 2021 19:43:29 -0400 Subject: [PATCH 066/154] chaning @sets to 4 based on recent ubenchs --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 856f5cffd..a63d50fcb 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -104,7 +104,7 @@ -gpgpu_unified_l1d_size 96 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 3750de09f..47bf1c898 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -117,7 +117,7 @@ -gpgpu_unified_l1d_size 128 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 diff --git 
a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index e7f73059a..3db64b3bc 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -118,7 +118,7 @@ -gpgpu_unified_l1d_size 128 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_gmem_skip_L1D 0 -gpgpu_l1_latency 20 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 9123e206f..c70cfe8f3 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -111,7 +111,7 @@ -gpgpu_unified_l1d_size 128 # Ampere unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:1:128:256,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_gmem_skip_L1D 0 -gpgpu_l1_latency 20 From 24ffab25f41d76b94fd2012a8897312a73a7165f Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 21:17:40 -0400 Subject: [PATCH 067/154] moving shmem option to the base class and change the code to accept turing config --- src/abstract_hardware_model.h | 1 + src/gpgpu-sim/gpu-cache.h | 3 +-- src/gpgpu-sim/shader.cc | 46 +++++++---------------------------- src/gpgpu-sim/shader.h | 26 ++++++++++++++++++++ 4 files changed, 37 insertions(+), 39 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 17a1cecb1..b33c50bd4 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -374,6 +374,7 @@ class core_config { unsigned mem_warp_parts; mutable unsigned gpgpu_shmem_size; char *gpgpu_shmem_option; + std::vector shmem_opt_list; unsigned gpgpu_shmem_sizeDefault; unsigned gpgpu_shmem_sizePrefL1; unsigned gpgpu_shmem_sizePrefShared; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index d80152812..26ed6211c 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -577,6 +577,7 @@ class cache_config { } // set * assoc * cacheline size. 
Then convert Byte to KB + // gpgpu_unified_cache_size is in KB while original_sz is in B unsigned original_size = m_nset * m_assoc * m_line_sz / 1024; if (m_unified_cache_size > 0) { max_cache_multiplier = m_unified_cache_size / original_size; @@ -785,12 +786,10 @@ class cache_config { } unsigned get_max_num_lines() const { assert(m_valid); - // gpgpu_unified_cache_size is in KB while original_sz is in B return max_cache_multiplier * m_nset * original_m_assoc; } unsigned get_max_assoc() const { assert(m_valid); - // gpgpu_unified_cache_size is in KB while original_sz is in B return max_cache_multiplier * original_m_assoc; } void print(FILE *fp) const { diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 75fbe1646..bc747d676 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3334,56 +3334,28 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { if (adaptive_cache_config && !k.cache_config_set) { // For more info about adaptive cache, see // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x - std::vector shmem_list; - for (unsigned i = 0; i < strlen(gpgpu_shmem_option); i++) { - char option[4]; - int j = 0; - while (gpgpu_shmem_option[i] != ',' && i < strlen(gpgpu_shmem_option)) { - if (gpgpu_shmem_option[i] == ' ') { - // skip spaces - i++; - } else { - if (!isdigit(gpgpu_shmem_option[i])) { - // check for non digits, which should not be here - assert(0 && "invalid config: -gpgpu_shmem_option"); - } - option[j] = gpgpu_shmem_option[i]; - j++; - i++; - } - } - // convert KB -> B - shmem_list.push_back((unsigned)atoi(option) * 1024); - } - unsigned total_shmem = kernel_info->smem * result; // Unified cache config is in KB. 
Converting to B unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; - std::sort(shmem_list.begin(), shmem_list.end()); - assert(total_shmem >= 0 && total_shmem <= shmem_list.back()); + assert(total_shmem >= 0 && total_shmem <= shmem_opt_list.back()); switch (adaptive_cache_config) { case FIXED: break; case ADAPTIVE_CACHE: { - // For more info about adaptive cache, see bool l1d_configured = false; unsigned max_assoc = m_L1D_config.get_max_assoc(); - if (total_shmem == 0) { - m_L1D_config.set_assoc(max_assoc); - l1d_configured = true; - } else { - for (std::vector::iterator it = shmem_list.begin(); - it < shmem_list.end() - 1; it++) { - if (total_shmem > *it && total_shmem <= *(it + 1)) { - float l1_ratio = 1 - (float) *(it + 1) / total_unified; - m_L1D_config.set_assoc(max_assoc * l1_ratio); - l1d_configured = true; - break; - } + for (std::vector::const_iterator it = shmem_opt_list.begin(); + it < shmem_opt_list.end(); it++) { + if (total_shmem <= *it) { + float l1_ratio = 1 - ((float) *(it) / total_unified); + m_L1D_config.set_assoc(max_assoc * l1_ratio); + l1d_configured = true; + break; } } + assert(l1d_configured && "no shared memory option found"); break; } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index a7a2c02d6..42bbdcb99 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1495,6 +1495,32 @@ class shader_core_config : public core_config { } else break; // we only accept continuous specialized_units, i.e., 1,2,3,4 } + + //parse gpgpu_shmem_option for adpative cache config + if(adaptive_cache_config) { + for (unsigned i = 0; i < strlen(gpgpu_shmem_option); i++) { + char option[4]; + int j = 0; + while (gpgpu_shmem_option[i] != ',' && i < strlen(gpgpu_shmem_option)) { + if (gpgpu_shmem_option[i] == ' ') { + // skip spaces + i++; + } else { + if (!isdigit(gpgpu_shmem_option[i])) { + // check for non digits, which should not be here + assert(0 && "invalid config: -gpgpu_shmem_option"); + } + option[j] = 
gpgpu_shmem_option[i]; + j++; + i++; + } + } + // convert KB -> B + shmem_opt_list.push_back((unsigned)atoi(option) * 1024); + } + std::sort(shmem_opt_list.begin(), shmem_opt_list.end()); + } + } void reg_options(class OptionParser *opp); unsigned max_cta(const kernel_info_t &k) const; From fedcde3789f7921647caee184c0fa104403c848d Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Wed, 19 May 2021 21:42:29 -0400 Subject: [PATCH 068/154] moving the unified size from the base class config to l1 config --- src/gpgpu-sim/gpu-cache.h | 30 ++++++++++++++++-------------- src/gpgpu-sim/shader.cc | 3 ++- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 26ed6211c..8bd62da39 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -575,15 +575,6 @@ class cache_config { } exit_parse_error(); } - - // set * assoc * cacheline size. Then convert Byte to KB - // gpgpu_unified_cache_size is in KB while original_sz is in B - unsigned original_size = m_nset * m_assoc * m_line_sz / 1024; - if (m_unified_cache_size > 0) { - max_cache_multiplier = m_unified_cache_size / original_size; - } else { - max_cache_multiplier = MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; - } switch (ct) { case 'N': @@ -694,7 +685,6 @@ class cache_config { m_sector_sz_log2 = LOGB2(SECTOR_SIZE); original_m_assoc = m_assoc; - // For more details about difference between FETCH_ON_WRITE and WRITE // VALIDAE policies Read: Jouppi, Norman P. "Cache write policies and // performance". ISCA 93. 
WRITE_ALLOCATE is the old write policy in @@ -786,11 +776,11 @@ class cache_config { } unsigned get_max_num_lines() const { assert(m_valid); - return max_cache_multiplier * m_nset * original_m_assoc; + return get_max_cache_multiplier() * m_nset * original_m_assoc; } unsigned get_max_assoc() const { assert(m_valid); - return max_cache_multiplier * original_m_assoc; + return get_max_cache_multiplier() * original_m_assoc; } void print(FILE *fp) const { fprintf(fp, "Size = %d B (%d Set x %d-way x %d byte line)\n", @@ -799,6 +789,8 @@ class cache_config { virtual unsigned set_index(new_addr_type addr) const; + virtual unsigned get_max_cache_multiplier() const { return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER;} + unsigned hash_function(new_addr_type addr, unsigned m_nset, unsigned m_line_sz_log2, unsigned m_nset_log2, unsigned m_index_function) const; @@ -840,7 +832,6 @@ class cache_config { char *m_config_stringPrefL1; char *m_config_stringPrefShared; FuncCache cache_status; - unsigned m_unified_cache_size; unsigned m_wr_percent; write_allocate_policy_t get_write_allocate_policy() { return m_write_alloc_policy; @@ -867,7 +858,6 @@ class cache_config { unsigned m_sector_sz_log2; unsigned original_m_assoc; bool m_is_streaming; - unsigned max_cache_multiplier; enum replacement_policy_t m_replacement_policy; // 'L' = LRU, 'F' = FIFO enum write_policy_t @@ -922,6 +912,18 @@ class l1d_cache_config : public cache_config { unsigned l1_banks_byte_interleaving; unsigned l1_banks_byte_interleaving_log2; unsigned l1_banks_hashing_function; + unsigned m_unified_cache_size; + virtual unsigned get_max_cache_multiplier() const { + // set * assoc * cacheline size. 
Then convert Byte to KB + // gpgpu_unified_cache_size is in KB while original_sz is in B + if (m_unified_cache_size > 0) { + unsigned original_size = m_nset * original_m_assoc * m_line_sz / 1024; + assert(m_unified_cache_size % original_size == 0); + return m_unified_cache_size / original_size; + } else { + return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + } + } }; class l2_cache_config : public cache_config { diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index bc747d676..7f27b7b64 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3335,10 +3335,11 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { // For more info about adaptive cache, see // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x unsigned total_shmem = kernel_info->smem * result; + assert(total_shmem >= 0 && total_shmem <= shmem_opt_list.back()); + // Unified cache config is in KB. Converting to B unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; - assert(total_shmem >= 0 && total_shmem <= shmem_opt_list.back()); switch (adaptive_cache_config) { case FIXED: break; From 8aee56d7401af9a91a1de3adae1b61329e0d30e5 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 22:10:53 -0400 Subject: [PATCH 069/154] rename set_dirty_byte_mask --- src/gpgpu-sim/gpu-cache.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 8bd62da39..91cde7e8f 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -210,13 +210,13 @@ struct line_cache_block : public cache_block_t { m_status = status; } virtual void set_byte_mask(mem_fetch *mf) { - m_byte_mask = m_byte_mask | mf->get_access_byte_mask(); + m_dirty_byte_mask = m_dirty_byte_mask | mf->get_access_byte_mask(); } virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { - m_byte_mask = m_byte_mask | byte_mask; + m_dirty_byte_mask = m_dirty_byte_mask | byte_mask; } virtual mem_access_byte_mask_t get_dirty_byte_mask() { - return m_byte_mask; + return m_dirty_byte_mask; } virtual mem_access_sector_mask_t get_dirty_sector_mask() { mem_access_sector_mask_t sector_mask; @@ -270,7 +270,7 @@ struct line_cache_block : public cache_block_t { bool m_set_readable_on_fill; bool m_set_byte_mask_on_fill; bool m_readable; - mem_access_byte_mask_t m_byte_mask; + mem_access_byte_mask_t m_dirty_byte_mask; }; struct sector_cache_block : public cache_block_t { @@ -290,7 +290,7 @@ struct sector_cache_block : public cache_block_t { m_line_alloc_time = 0; m_line_last_access_time = 0; m_line_fill_time = 0; - m_byte_mask.reset(); + m_dirty_byte_mask.reset(); } virtual void allocate(new_addr_type tag, new_addr_type block_addr, @@ -405,13 +405,13 @@ struct sector_cache_block : public cache_block_t { } virtual void set_byte_mask(mem_fetch *mf) { - m_byte_mask = m_byte_mask | mf->get_access_byte_mask(); + m_dirty_byte_mask = m_dirty_byte_mask | mf->get_access_byte_mask(); } virtual void set_byte_mask(mem_access_byte_mask_t byte_mask) { - m_byte_mask = m_byte_mask | byte_mask; + m_dirty_byte_mask = m_dirty_byte_mask | byte_mask; } virtual mem_access_byte_mask_t 
get_dirty_byte_mask() { - return m_byte_mask; + return m_dirty_byte_mask; } virtual mem_access_sector_mask_t get_dirty_sector_mask() { mem_access_sector_mask_t sector_mask; @@ -492,7 +492,7 @@ struct sector_cache_block : public cache_block_t { bool m_set_readable_on_fill[SECTOR_CHUNCK_SIZE]; bool m_set_byte_mask_on_fill; bool m_readable[SECTOR_CHUNCK_SIZE]; - mem_access_byte_mask_t m_byte_mask; + mem_access_byte_mask_t m_dirty_byte_mask; unsigned get_sector_index(mem_access_sector_mask_t sector_mask) { assert(sector_mask.count() == 1); From b466afea67e6d6faf49f01ecfe378257fbdb93af Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Wed, 19 May 2021 22:20:04 -0400 Subject: [PATCH 070/154] eliminate redundant code in gpu-cache.h --- src/gpgpu-sim/gpu-cache.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 91cde7e8f..6698d9286 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -596,16 +596,6 @@ class cache_config { default: exit_parse_error(); } - switch (rp) { - case 'L': - m_replacement_policy = LRU; - break; - case 'F': - m_replacement_policy = FIFO; - break; - default: - exit_parse_error(); - } switch (wp) { case 'R': m_write_policy = READ_ONLY; From 7fac247e3e1c4326081c3ea4d46da6c5dc83ccb9 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 22:20:56 -0400 Subject: [PATCH 071/154] change L1 cache config in Volta+ to be write-through and write-allocate based on recent ubench --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 3 +-- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index a63d50fcb..f715f3aa4 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -104,7 +104,7 @@ -gpgpu_unified_l1d_size 96 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 47bf1c898..5f22a42b0 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -117,7 +117,7 @@ -gpgpu_unified_l1d_size 128 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_l1_latency 20 -gpgpu_gmem_skip_L1D 0 diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 3db64b3bc..c44563fb6 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -118,7 +118,7 @@ -gpgpu_unified_l1d_size 128 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_gmem_skip_L1D 0 
-gpgpu_l1_latency 20 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index c70cfe8f3..02cdb9ec7 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -101,7 +101,6 @@ ## L1/shared memory configuration # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo -# Default config is 28KB DL1 and 100KB shared memory # In Ampere, we assign the remaining shared memory to L1 cache # if the assigned shd mem = 0, then L1 cache = 128KB # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-8-x @@ -111,7 +110,7 @@ -gpgpu_unified_l1d_size 128 # Ampere unified cache has four banks -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:8,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 -gpgpu_l1_cache_write_ratio 25 -gpgpu_gmem_skip_L1D 0 -gpgpu_l1_latency 20 From 0d33266ff6ca9b880dff40f6338c8a5cae696438 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 19 May 2021 22:25:37 -0400 Subject: [PATCH 072/154] oops delete this config, it should not be pushed --- configs/tested-cfgs/TITAN_V/gpgpusim.config | 173 -------------------- configs/tested-cfgs/TITAN_V/trace.config | 18 -- 2 files changed, 191 deletions(-) delete mode 100644 configs/tested-cfgs/TITAN_V/gpgpusim.config delete mode 100644 configs/tested-cfgs/TITAN_V/trace.config diff --git a/configs/tested-cfgs/TITAN_V/gpgpusim.config b/configs/tested-cfgs/TITAN_V/gpgpusim.config deleted file mode 100644 index 8b5cb202f..000000000 --- a/configs/tested-cfgs/TITAN_V/gpgpusim.config +++ /dev/null @@ -1,173 +0,0 @@ -# functional simulator specification --gpgpu_ptx_instruction_classification 0 --gpgpu_ptx_sim_mode 0 --gpgpu_ptx_force_max_capability 70 - -# Device Limits --gpgpu_stack_size_limit 1024 --gpgpu_heap_size_limit 8388608 --gpgpu_runtime_sync_depth_limit 2 --gpgpu_runtime_pending_launch_count_limit 2048 --gpgpu_kernel_launch_latency 6745 --gpgpu_TB_launch_latency 0 - -# Compute Capability --gpgpu_compute_capability_major 7 --gpgpu_compute_capability_minor 0 - -# PTX execution-driven --gpgpu_ptx_convert_to_ptxplus 0 --gpgpu_ptx_save_converted_ptxplus 0 - -# high level architecture configuration --gpgpu_n_clusters 80 --gpgpu_n_cores_per_cluster 1 --gpgpu_n_mem 24 --gpgpu_n_sub_partition_per_mchannel 2 - -# clock domains -#-gpgpu_clock_domains ::: --gpgpu_clock_domains 1200:1200:1200:850 - -# shader core pipeline config --gpgpu_shader_registers 65536 --gpgpu_registers_per_block 65536 --gpgpu_occupancy_sm_number 70 - --gpgpu_shader_core_pipeline 2048:32 --gpgpu_shader_cta 32 --gpgpu_simd_model 1 - -# Pipeline widths and number of FUs -# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE --gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 --gpgpu_num_sp_units 4 --gpgpu_num_sfu_units 4 --gpgpu_num_dp_units 4 --gpgpu_num_int_units 4 --gpgpu_tensor_core_avail 1 
--gpgpu_num_tensor_core_units 4 - -# Instruction latencies and initiation intervals -# "ADD,MAX,MUL,MAD,DIV" -# All Div operations are executed on SFU unit --ptx_opcode_latency_int 4,4,4,4,21 --ptx_opcode_initiation_int 2,2,2,2,2 --ptx_opcode_latency_fp 4,4,4,4,39 --ptx_opcode_initiation_fp 2,2,2,2,4 --ptx_opcode_latency_dp 8,8,8,8,330 --ptx_opcode_initiation_dp 4,4,4,4,130 --ptx_opcode_latency_sfu 21 --ptx_opcode_initiation_sfu 8 --ptx_opcode_latency_tesnor 35 --ptx_opcode_initiation_tensor 32 - -# sub core model: in which each scheduler has its own register file and EUs -# i.e. schedulers are isolated --gpgpu_sub_core_model 1 -# disable specialized operand collectors and use generic operand collectors instead --gpgpu_enable_specialized_operand_collector 0 --gpgpu_operand_collector_num_units_gen 8 --gpgpu_operand_collector_num_in_ports_gen 8 --gpgpu_operand_collector_num_out_ports_gen 8 -# register banks --gpgpu_num_reg_banks 16 --gpgpu_reg_file_port_throughput 2 - -# shared memory bankconflict detection --gpgpu_shmem_num_banks 32 --gpgpu_shmem_limited_broadcast 0 --gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 70 - -# warp scheduling --gpgpu_num_sched_per_core 4 --gpgpu_scheduler gto -# a warp scheduler issue mode --gpgpu_max_insn_issue_per_warp 1 --gpgpu_dual_issue_diff_exec_units 1 - -## L1/shared memory configuration -# ::,::::,::,:** -# ** Optional parameter - Required when mshr_type==Texture Fifo -# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache --gpgpu_adaptive_cache_config 1 --gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:L:m:N:L,A:512:64,16:0,32 --gpgpu_shmem_size 98304 --gpgpu_shmem_sizeDefault 98304 --gpgpu_shmem_per_block 49152 --gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 --gpgpu_l1_latency 33 --gpgpu_smem_latency 27 --gpgpu_flush_l1_cache 1 - -# L2 cache --gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 --gpgpu_cache:dl2_texture_only 0 --gpgpu_dram_partition_queues 64:64:64:64 
--gpgpu_perf_sim_memcpy 1 --gpgpu_memory_partition_indexing 0 - -# 128 KB Inst. --gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 --gpgpu_inst_fetch_throughput 4 -# 128 KB Tex -# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod --gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 -# 64 KB Const --gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 --gpgpu_perfect_inst_const_cache 1 - -# interconnection -# use built-in local xbar --network_mode 2 --icnt_in_buffer_limit 512 --icnt_out_buffer_limit 512 --icnt_subnets 2 --icnt_flit_size 40 --icnt_arbiter_algo 1 - -# memory partition latency config --gpgpu_l2_rop_latency 177 --dram_latency 103 - -# dram sched config --gpgpu_dram_scheduler 1 --gpgpu_frfcfs_dram_sched_queue_size 64 --gpgpu_dram_return_queue_size 192 - -# dram model config --gpgpu_n_mem_per_ctrlr 1 --gpgpu_dram_buswidth 16 --gpgpu_dram_burst_length 2 --dram_data_command_freq_ratio 2 --gpgpu_mem_address_mask 1 --gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS - -# Mem timing --gpgpu_dram_timing_opt nbk=16:CCD=1:RRD=4:RCD=12:RAS=29:RP=12:RC=40:CL=12:WL=2:CDLR=3:WR=11:nbkgrp=4:CCDL=2:RTPL=4 --dram_dual_bus_interface 1 - -# select lower bits for bnkgrp to increase bnkgrp parallelism --dram_bnk_indexing_policy 0 --dram_bnkgrp_indexing_policy 1 - -#-dram_seperate_write_queue_enable 1 -#-dram_write_queue_size 64:56:32 - -# stat collection --gpgpu_memlatency_stat 14 --gpgpu_runtime_stat 500 --enable_ptx_file_line_stats 1 --visualizer_enabled 0 - -# power model configs, disable it untill we create a real energy model --power_simulation_enabled 0 - -# tracing functionality -#-trace_enabled 1 -#-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 - diff --git a/configs/tested-cfgs/TITAN_V/trace.config b/configs/tested-cfgs/TITAN_V/trace.config deleted file mode 100644 index 6e193f7bf..000000000 --- 
a/configs/tested-cfgs/TITAN_V/trace.config +++ /dev/null @@ -1,18 +0,0 @@ --trace_opcode_latency_initiation_int 4,2 --trace_opcode_latency_initiation_sp 4,2 --trace_opcode_latency_initiation_dp 8,4 --trace_opcode_latency_initiation_sfu 21,8 --trace_opcode_latency_initiation_tensor 2,2 - -#execute branch insts on spec unit 1 -#,,,,, --specialized_unit_1 1,4,4,4,4,BRA --trace_opcode_latency_initiation_spec_op_1 4,4 - -#TEX unit, make fixed latency for all tex insts --specialized_unit_2 1,4,200,4,4,TEX --trace_opcode_latency_initiation_spec_op_2 200,4 - -#tensor unit --specialized_unit_3 1,4,2,4,4,TENSOR --trace_opcode_latency_initiation_spec_op_3 2,2 From c8eca04403d3acaff413788e342fd6aadd122948 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 17 May 2021 17:35:06 -0400 Subject: [PATCH 073/154] fix merge conflict --- src/gpgpu-sim/gpu-cache.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 6698d9286..007403f5a 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -563,10 +563,10 @@ class cache_config { char ct, rp, wp, ap, mshr_type, wap, sif; int ntok = - sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u,%u", &ct, + sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u", &ct, &m_nset, &m_line_sz, &m_assoc, &rp, &wp, &ap, &wap, &sif, &mshr_type, &m_mshr_entries, &m_mshr_max_merge, - &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width, &m_wr_percent); + &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width); if (ntok < 12) { if (!strcmp(config, "none")) { From f665ad5a49620b47118cbf6d578b469155e2a500 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Thu, 20 May 2021 20:52:06 -0400 Subject: [PATCH 074/154] L2 breakdown - reuse mf allocator --- src/abstract_hardware_model.h | 4 ++- src/gpgpu-sim/gpu-cache.cc | 10 +++---- src/gpgpu-sim/l2cache.cc | 56 
+++++++++++++---------------------- src/gpgpu-sim/l2cache.h | 4 ++- src/gpgpu-sim/shader.cc | 19 ++++++------ src/gpgpu-sim/shader.h | 4 ++- 6 files changed, 45 insertions(+), 52 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index b33c50bd4..60d7328e7 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -876,7 +876,9 @@ class mem_fetch_allocator { const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, - unsigned long long cycle) const = 0; + unsigned long long cycle, + unsigned wid, unsigned sid, + unsigned tpc, mem_fetch *original_mf) const = 0; }; // the maximum number of destination, source, or address uarch operands in a diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 297a94c08..23c5592d0 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1338,7 +1338,7 @@ enum cache_request_status data_cache::wr_miss_wa_naive( evicted.m_block_addr,m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1391,7 +1391,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( evicted.m_block_addr,m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1464,7 +1464,7 @@ enum 
cache_request_status data_cache::wr_miss_wa_fetch_on_write( evicted.m_block_addr,m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1549,7 +1549,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( evicted.m_block_addr,m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1631,7 +1631,7 @@ enum cache_request_status data_cache::rd_miss_base( evicted.m_block_addr,m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 00b14d7f6..0db6bd44c 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -57,18 +57,19 @@ mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, return mf; } -mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, - mem_access_type type, +mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, const 
mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, - unsigned long long cycle) const { + unsigned long long cycle, + unsigned wid, unsigned sid, + unsigned tpc, mem_fetch *original_mf) const { mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, m_memory_config->gpgpu_ctx); mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, - -1, -1, m_memory_config, cycle); + new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, + sid, tpc, m_memory_config, cycle,original_mf); return mf; } memory_partition_unit::memory_partition_unit(unsigned partition_id, @@ -724,16 +725,11 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - const mem_access_t *ma = new mem_access_t( - mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, SECTOR_SIZE, - mf->is_write(), mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask, - std::bitset().set(i), m_gpu->gpgpu_ctx); - - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr() + SECTOR_SIZE * i, + mf->get_access_type(),mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask,std::bitset().set(i), + SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); result.push_back(n_mf); } @@ -750,16 +746,11 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - const mem_access_t *ma = new mem_access_t( - mf->get_access_type(), mf->get_addr(), SECTOR_SIZE, - mf->is_write(), mf->get_access_warp_mask(), - 
mf->get_access_byte_mask() & mask, - std::bitset().set(i), m_gpu->gpgpu_ctx); - - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr(), + mf->get_access_type(),mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask,std::bitset().set(i), + SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); result.push_back(n_mf); } @@ -770,16 +761,11 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - const mem_access_t *ma = new mem_access_t( - mf->get_access_type(), mf->get_addr() + SECTOR_SIZE * i, - SECTOR_SIZE, mf->is_write(), mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask, - std::bitset().set(i), m_gpu->gpgpu_ctx); - - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf); + mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr() + SECTOR_SIZE * i, + mf->get_access_type(),mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask,std::bitset().set(i), + SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); result.push_back(n_mf); } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 1f5d7c468..59432b88d 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -56,7 +56,9 @@ class partition_mf_allocator : public mem_fetch_allocator { const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, - unsigned long long cycle) const; + unsigned long long cycle, + unsigned wid, unsigned sid, + unsigned 
tpc, mem_fetch *original_mf) const; private: const memory_config *m_memory_config; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 7f27b7b64..51366deb4 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -62,18 +62,19 @@ mem_fetch *shader_core_mem_fetch_allocator::alloc( return mf; } -mem_fetch *shader_core_mem_fetch_allocator::alloc( - new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle) const { +mem_fetch *shader_core_mem_fetch_allocator::alloc(new_addr_type addr, mem_access_type type, + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, + unsigned long long cycle, + unsigned wid, unsigned sid, + unsigned tpc, mem_fetch *original_mf) const { mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, m_memory_config->gpgpu_ctx); mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, - m_core_id, m_cluster_id, m_memory_config, cycle); + new mem_fetch(access, NULL, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, + m_core_id, m_cluster_id, m_memory_config, cycle,original_mf); return mf; } ///////////////////////////////////////////////////////////////////////////// diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 42bbdcb99..866231357 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1903,7 +1903,9 @@ class shader_core_mem_fetch_allocator : public mem_fetch_allocator { const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, - unsigned long long cycle) const; + unsigned long long cycle, + unsigned wid, unsigned sid, + unsigned tpc, mem_fetch *original_mf) const; mem_fetch *alloc(const warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const { warp_inst_t inst_copy = inst; From b814c52fe9c4538669d845c5f05b247348f6fd1d Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Fri, 21 May 2021 15:12:43 -0400 Subject: [PATCH 075/154] cast to float - dirty line percentage --- src/gpgpu-sim/gpu-cache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 23c5592d0..7e7d2adc4 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -282,7 +282,7 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, // percentage of dirty lines in the cache // number of dirty lines / total lines in the cache float dirty_line_percentage = - (float) (m_dirty / (m_config.m_nset * m_config.m_assoc )) * 100; + ((float) m_dirty / (m_config.m_nset * m_config.m_assoc )) * 100; if (!line->is_modified_line() || dirty_line_percentage >= m_config.m_wr_percent) { // if number of dirty lines in the cache is greater than From 3b75d8f22694e6a8743793e5bc07779f518650b9 Mon Sep 17 00:00:00 2001 From: mkhairy Date: Sat, 22 May 2021 09:04:13 -0400 Subject: [PATCH 076/154] Update version --- version | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/version b/version index 1a1a990cd..c832e567c 100644 --- a/version +++ b/version @@ -1 +1 @@ -const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.0.0 "; +const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.1.0 "; From 7e48560639e453fa2e4d86c99bec08f4a43bd884 Mon Sep 17 00:00:00 2001 From: mkhairy Date: Sat, 22 May 2021 09:08:05 -0400 Subject: [PATCH 077/154] Update CHANGES --- CHANGES | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CHANGES b/CHANGES index 0c48a3dc0..7964153c0 100644 --- a/CHANGES +++ b/CHANGES @@ -1,4 +1,16 @@ LOG: +Version 4.1.0 versus 4.0.0 +-Features: +1- Supporting L1 write-allocate with sub-sector writing policy as in Volta+ hardware, and changing the Volta+ cards config to make L1 write-allocate with write-through +2- Making the L1 adaptive cache policy to be configurable +3- Adding Ampere RTX 3060 config files +-Bugs: +1- Fixing L1 bank hash function bug +2- Fixing L1 read hit counters in gpgpu-sim to match nvprof, to achieve more accurate L1 correlation with the HW +3- Fixing bugs in lazy write handling, thanks to Gwendolyn Voskuilen from Sandia labs for this fix +4- Fixing the backend pipeline for sub_core model +5- Fixing Memory stomp bug at the shader_config +6- Some code refactoring: Version 4.0.0 (development branch) versus 3.2.3 -Front-End: 1- Support .nc cache modifier and __ldg function to access the read-only L1D cache From b6409b4605dac8e39ea22ea6977a28c31177e44a Mon Sep 17 00:00:00 2001 From: mkhairy Date: Sat, 22 May 2021 09:34:34 -0400 Subject: [PATCH 078/154] Update README.md --- README.md | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9f9f6698f..9bb891659 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,11 @@ This version of GPGPU-Sim has been tested with a subset of CUDA version 4.2, Please see the copyright notice in the file COPYRIGHT distributed with this release in the 
same directory as this file. +GPGPU-Sim 4.0 is compatible with Accel-Sim simulation framework. With the support +of Accel-Sim, GPGPU-Sim 4.0 can run NVIDIA SASS traces (trace-based simulation) +generated by NVIDIA's dynamic binary instrumentation tool (NVBit). For more information +about Accel-Sim, see [https://accel-sim.github.io/](https://accel-sim.github.io/) + If you use GPGPU-Sim 4.0 in your research, please cite: Mahmoud Khairy, Zhesheng Shen, Tor M. Aamodt, Timothy G Rogers. @@ -18,7 +23,7 @@ Accel-Sim: An Extensible Simulation Framework for Validated GPU Modeling. In proceedings of the 47th IEEE/ACM International Symposium on Computer Architecture (ISCA), May 29 - June 3, 2020. -If you use CuDNN or PyTorch support, checkpointing or our new debugging tool for functional +If you use CuDNN or PyTorch support (execution-driven simulation), checkpointing or our new debugging tool for functional simulation errors in GPGPU-Sim for your research, please cite: Jonathan Lew, Deval Shah, Suchita Pati, Shaylin Cattell, Mengchi Zhang, Amruth Sandhupatla, @@ -26,7 +31,6 @@ Christopher Ng, Negar Goli, Matthew D. Sinclair, Timothy G. Rogers, Tor M. Aamod Analyzing Machine Learning Workloads Using a Detailed GPU Simulator, arXiv:1811.08933, https://arxiv.org/abs/1811.08933 - If you use the Tensor Core model in GPGPU-Sim or GPGPU-Sim's CUTLASS Library for your research please cite: @@ -261,6 +265,7 @@ To clean the docs run The documentation resides at doc/doxygen/html. To run Pytorch applications with the simulator, install the modified Pytorch library as well by following instructions [here](https://github.com/gpgpu-sim/pytorch-gpgpu-sim). + ## Step 3: Run Before we run, we need to make sure the application's executable file is dynamically linked to CUDA runtime library. This can be done during compilation of your program by introducing the nvcc flag "--cudart shared" in makefile (quotes should be excluded). 
From 6c9e13db93e4a1614f7401e9675c62ea40b65a3b Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Sun, 23 May 2021 12:59:34 -0400 Subject: [PATCH 079/154] format code --- src/abstract_hardware_model.cc | 12 ++-- src/abstract_hardware_model.h | 13 ++-- src/cuda-sim/instructions.cc | 99 ++++++++++++++------------- src/cuda-sim/ptx_ir.cc | 4 +- src/cuda-sim/ptx_ir.h | 4 +- src/cuda-sim/ptx_parser.cc | 14 ++-- src/gpgpu-sim/gpu-cache.cc | 89 ++++++++++++------------ src/gpgpu-sim/gpu-cache.h | 84 +++++++++++------------ src/gpgpu-sim/gpu-sim.cc | 12 ++-- src/gpgpu-sim/l2cache.cc | 57 ++++++++-------- src/gpgpu-sim/l2cache.h | 13 ++-- src/gpgpu-sim/shader.cc | 119 +++++++++++++++++---------------- src/gpgpu-sim/shader.h | 17 ++--- 13 files changed, 273 insertions(+), 264 deletions(-) diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index e0e1d23cf..30aee60c9 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -448,7 +448,8 @@ void warp_inst_t::generate_mem_accesses() { for (unsigned thread = 0; thread < m_config->warp_size; thread++) { if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - new_addr_type block_address = line_size_based_tag_func(addr, cache_block_size); + new_addr_type block_address = + line_size_based_tag_func(addr, cache_block_size); accesses[block_address].set(thread); unsigned idx = addr - block_address; for (unsigned i = 0; i < data_size; i++) byte_mask.set(idx + i); @@ -530,7 +531,8 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, (m_per_scalar_thread[thread].memreqaddr[access] != 0); access++) { new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[access]; - new_addr_type block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte // chunk does this thread 
access? @@ -552,7 +554,8 @@ void warp_inst_t::memory_coalescing_arch(bool is_write, if (block_address != line_size_based_tag_func( addr + data_size_coales - 1, segment_size)) { addr = addr + data_size_coales - 1; - new_addr_type block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; transaction_info &info = subwarp_transactions[block_address]; info.chunks.set(chunk); @@ -625,7 +628,8 @@ void warp_inst_t::memory_coalescing_arch_atomic(bool is_write, if (!active(thread)) continue; new_addr_type addr = m_per_scalar_thread[thread].memreqaddr[0]; - new_addr_type block_address = line_size_based_tag_func(addr, segment_size); + new_addr_type block_address = + line_size_based_tag_func(addr, segment_size); unsigned chunk = (addr & 127) / 32; // which 32-byte chunk within in a 128-byte chunk // does this thread access? diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 60d7328e7..35e28ca57 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -872,13 +872,12 @@ class mem_fetch_allocator { const mem_access_t &access, unsigned long long cycle) const = 0; virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle, - unsigned wid, unsigned sid, - unsigned tpc, mem_fetch *original_mf) const = 0; + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, unsigned long long cycle, + unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const = 0; }; // the maximum number of destination, source, or address uarch operands in a diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 8936fa80e..0b990e83c 
100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -166,8 +166,9 @@ void inst_not_implemented(const ptx_instruction *pI); ptx_reg_t srcOperandModifiers(ptx_reg_t opData, operand_info opInfo, operand_info dstInfo, unsigned type, ptx_thread_info *thread); - -void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, int op_code); + +void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, + int op_code); void sign_extend(ptx_reg_t &data, unsigned src_size, const operand_info &dst); @@ -1711,40 +1712,50 @@ void bfi_impl(const ptx_instruction *pI, ptx_thread_info *thread) { } thread->set_operand_value(dst, data, i_type, thread, pI); } -void bfind_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ - const operand_info &dst = pI->dst(); +void bfind_impl(const ptx_instruction *pI, ptx_thread_info *thread) { + const operand_info &dst = pI->dst(); const operand_info &src1 = pI->src1(); const unsigned i_type = pI->get_type(); - const ptx_reg_t src1_data = thread->get_operand_value(src1, dst, i_type, thread, 1); - const int msb = ( i_type == U32_TYPE || i_type == S32_TYPE) ? 31 : 63; + const ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, i_type, thread, 1); + const int msb = (i_type == U32_TYPE || i_type == S32_TYPE) ? 
31 : 63; unsigned long a = 0; - switch (i_type) - { - case S32_TYPE: a = src1_data.s32; break; - case U32_TYPE: a = src1_data.u32; break; - case S64_TYPE: a = src1_data.s64; break; - case U64_TYPE: a = src1_data.u64; break; - default: assert(false); abort(); + switch (i_type) { + case S32_TYPE: + a = src1_data.s32; + break; + case U32_TYPE: + a = src1_data.u32; + break; + case S64_TYPE: + a = src1_data.s64; + break; + case U64_TYPE: + a = src1_data.u64; + break; + default: + assert(false); + abort(); } // negate negative signed inputs - if ( ( i_type == S32_TYPE || i_type == S64_TYPE ) && ( a & ( 1 << msb ) ) ) { - a = ~a; + if ((i_type == S32_TYPE || i_type == S64_TYPE) && (a & (1 << msb))) { + a = ~a; } uint32_t d_data = 0xffffffff; for (uint32_t i = msb; i >= 0; i--) { - if (a & (1<set_operand_value(dst, d_data, U32_TYPE, thread, pI); - - } void bra_impl(const ptx_instruction *pI, ptx_thread_info *thread) { @@ -6339,12 +6350,10 @@ void vmad_impl(const ptx_instruction *pI, ptx_thread_info *thread) { #define VMAX 0 #define VMIN 1 -void vmax_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ - video_mem_instruction(pI, thread, VMAX); +void vmax_impl(const ptx_instruction *pI, ptx_thread_info *thread) { + video_mem_instruction(pI, thread, VMAX); } -void vmin_impl(const ptx_instruction *pI, ptx_thread_info *thread) -{ +void vmin_impl(const ptx_instruction *pI, ptx_thread_info *thread) { video_mem_instruction(pI, thread, VMIN); } void vset_impl(const ptx_instruction *pI, ptx_thread_info *thread) { @@ -6440,12 +6449,12 @@ void vote_impl(const ptx_instruction *pI, ptx_thread_info *thread) { } } -void activemask_impl( const ptx_instruction *pI, ptx_thread_info *thread ) -{ +void activemask_impl(const ptx_instruction *pI, ptx_thread_info *thread) { active_mask_t l_activemask_bitset = pI->get_warp_active_mask(); - uint32_t l_activemask_uint = static_cast(l_activemask_bitset.to_ulong()); + uint32_t l_activemask_uint = + static_cast(l_activemask_bitset.to_ulong()); 
- const operand_info &dst = pI->dst(); + const operand_info &dst = pI->dst(); thread->set_operand_value(dst, l_activemask_uint, U32_TYPE, thread, pI); } @@ -6527,12 +6536,12 @@ ptx_reg_t srcOperandModifiers(ptx_reg_t opData, operand_info opInfo, return result; } -void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, int op_code) -{ - const operand_info &dst = pI->dst(); // d - const operand_info &src1 = pI->src1(); // a - const operand_info &src2 = pI->src2(); // b - const operand_info &src3 = pI->src3(); // c +void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, + int op_code) { + const operand_info &dst = pI->dst(); // d + const operand_info &src1 = pI->src1(); // a + const operand_info &src2 = pI->src2(); // b + const operand_info &src3 = pI->src3(); // c const unsigned i_type = pI->get_type(); @@ -6557,19 +6566,18 @@ void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, i auto option = options.begin(); assert(*option == ATOMIC_MAX || *option == ATOMIC_MIN); - switch ( i_type ) { + switch (i_type) { case S32_TYPE: { // assert all operands are S32_TYPE: scalar_type = pI->get_scalar_type(); - for (std::list::iterator scalar = scalar_type.begin(); scalar != scalar_type.end(); scalar++) - { + for (std::list::iterator scalar = scalar_type.begin(); + scalar != scalar_type.end(); scalar++) { assert(*scalar == S32_TYPE); } assert(scalar_type.size() == 3); scalar_type.clear(); - switch (op_code) - { + switch (op_code) { case VMAX: data.s32 = MY_MAX_I(ta.s32, tb.s32); break; @@ -6580,26 +6588,23 @@ void video_mem_instruction(const ptx_instruction *pI, ptx_thread_info *thread, i assert(0); } - switch (*option) - { + switch (*option) { case ATOMIC_MAX: data.s32 = MY_MAX_I(data.s32, c.s32); - break; + break; case ATOMIC_MIN: data.s32 = MY_MIN_I(data.s32, c.s32); - break; + break; default: - assert(0); // not yet implemented + assert(0); // not yet implemented } break; - } default: - assert(0); // not 
yet implemented + assert(0); // not yet implemented } thread->set_operand_value(dst, data, i_type, thread, pI); return; } - diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index e5b5fb773..d3da4b541 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -1147,8 +1147,8 @@ static std::list check_operands( const std::list &operands, gpgpu_context *ctx) { static int g_warn_literal_operands_two_type_inst; if ((opcode == CVT_OP) || (opcode == SET_OP) || (opcode == SLCT_OP) || - (opcode == TEX_OP) || (opcode == MMA_OP) || (opcode == DP4A_OP) || - (opcode == VMIN_OP) || (opcode == VMAX_OP) ) { + (opcode == TEX_OP) || (opcode == MMA_OP) || (opcode == DP4A_OP) || + (opcode == VMIN_OP) || (opcode == VMAX_OP)) { // just make sure these do not have have const operands... if (!g_warn_literal_operands_two_type_inst) { std::list::const_iterator o; diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index 42439412c..825175964 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -966,8 +966,8 @@ class ptx_instruction : public warp_inst_t { int get_pred_mod() const { return m_pred_mod; } const char *get_source() const { return m_source.c_str(); } - const std::list get_scalar_type() const {return m_scalar_type;} - const std::list get_options() const {return m_options;} + const std::list get_scalar_type() const { return m_scalar_type; } + const std::list get_options() const { return m_options; } typedef std::vector::const_iterator const_iterator; diff --git a/src/cuda-sim/ptx_parser.cc b/src/cuda-sim/ptx_parser.cc index afdb41ba8..86a33c2d3 100644 --- a/src/cuda-sim/ptx_parser.cc +++ b/src/cuda-sim/ptx_parser.cc @@ -622,13 +622,13 @@ void ptx_recognizer::add_scalar_type_spec(int type_spec) { g_ptx_token_decode[type_spec].c_str()); g_scalar_type.push_back(type_spec); if (g_scalar_type.size() > 1) { - parse_assert((g_opcode == -1) || (g_opcode == CVT_OP) || - (g_opcode == SET_OP) || (g_opcode == SLCT_OP) || - (g_opcode == TEX_OP) || 
(g_opcode == MMA_OP) || - (g_opcode == DP4A_OP) || (g_opcode == VMIN_OP) || - (g_opcode == VMAX_OP), - "only cvt, set, slct, tex, vmin, vmax and dp4a can have more than one " - "type specifier."); + parse_assert( + (g_opcode == -1) || (g_opcode == CVT_OP) || (g_opcode == SET_OP) || + (g_opcode == SLCT_OP) || (g_opcode == TEX_OP) || + (g_opcode == MMA_OP) || (g_opcode == DP4A_OP) || + (g_opcode == VMIN_OP) || (g_opcode == VMAX_OP), + "only cvt, set, slct, tex, vmin, vmax and dp4a can have more than one " + "type specifier."); } g_scalar_type_spec = type_spec; } diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 7e7d2adc4..28d3215ae 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -37,7 +37,8 @@ const char *cache_request_status_str(enum cache_request_status status) { static const char *static_cache_request_status_str[] = { - "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", "SECTOR_MISS", "MSHR_HIT"}; + "HIT", "HIT_RESERVED", "MISS", "RESERVATION_FAIL", + "SECTOR_MISS", "MSHR_HIT"}; assert(sizeof(static_cache_request_status_str) / sizeof(const char *) == NUM_CACHE_REQUEST_STATUS); @@ -63,9 +64,9 @@ unsigned l1d_cache_config::set_bank(new_addr_type addr) const { // For sector cache, we select one sector per bank (sector interleaving) // This is what was found in Volta (one sector per bank, sector interleaving) // otherwise, line interleaving - return cache_config::hash_function(addr, l1_banks, l1_banks_byte_interleaving_log2, - l1_banks_log2, - l1_banks_hashing_function); + return cache_config::hash_function(addr, l1_banks, + l1_banks_byte_interleaving_log2, + l1_banks_log2, l1_banks_hashing_function); } unsigned cache_config::set_index(new_addr_type addr) const { @@ -235,7 +236,7 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, mem_fetch *mf, bool is_write, bool probe_mode) const { mem_access_sector_mask_t mask = mf->get_access_sector_mask(); - return probe(addr, idx, mask,is_write, 
probe_mode, mf); + return probe(addr, idx, mask, is_write, probe_mode, mf); } enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, @@ -281,8 +282,8 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, if (!line->is_reserved_line()) { // percentage of dirty lines in the cache // number of dirty lines / total lines in the cache - float dirty_line_percentage = - ((float) m_dirty / (m_config.m_nset * m_config.m_assoc )) * 100; + float dirty_line_percentage = + ((float)m_dirty / (m_config.m_nset * m_config.m_assoc)) * 100; if (!line->is_modified_line() || dirty_line_percentage >= m_config.m_wr_percent) { // if number of dirty lines in the cache is greater than @@ -357,7 +358,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, evicted.set_info(m_lines[idx]->m_block_addr, m_lines[idx]->get_modified_size(), m_lines[idx]->get_dirty_byte_mask(), - m_lines[idx]->get_dirty_sector_mask()); + m_lines[idx]->get_dirty_sector_mask()); m_dirty--; } m_lines[idx]->allocate(m_config.tag(addr), m_config.block_addr(addr), @@ -372,9 +373,9 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, bool before = m_lines[idx]->is_modified_line(); ((sector_cache_block *)m_lines[idx]) ->allocate_sector(time, mf->get_access_sector_mask()); - if (before && !m_lines[idx]->is_modified_line()) { - m_dirty--; - } + if (before && !m_lines[idx]->is_modified_line()) { + m_dirty--; + } } break; case RESERVATION_FAIL: @@ -391,16 +392,18 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, return status; } -void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write) { - fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask(), is_write); +void tag_array::fill(new_addr_type addr, unsigned time, mem_fetch *mf, + bool is_write) { + fill(addr, time, mf->get_access_sector_mask(), mf->get_access_byte_mask(), + is_write); } void 
tag_array::fill(new_addr_type addr, unsigned time, - mem_access_sector_mask_t mask, mem_access_byte_mask_t byte_mask, - bool is_write) { + mem_access_sector_mask_t mask, + mem_access_byte_mask_t byte_mask, bool is_write) { // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; - enum cache_request_status status = probe(addr, idx, mask,is_write); + enum cache_request_status status = probe(addr, idx, mask, is_write); bool before = m_lines[idx]->is_modified_line(); // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request @@ -423,7 +426,8 @@ void tag_array::fill(new_addr_type addr, unsigned time, void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); - m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); + m_lines[index]->fill(time, mf->get_access_sector_mask(), + mf->get_access_byte_mask()); m_dirty++; } @@ -437,7 +441,7 @@ void tag_array::flush() { m_lines[i]->set_status(INVALID, mem_access_sector_mask_t().set(j)); } } - + m_dirty = 0; is_used = false; } @@ -794,8 +798,8 @@ void cache_stats::print_stats(FILE *fout, const char *cache_name) const { m_stats[type][status]); if (status != RESERVATION_FAIL && status != MSHR_HIT) - // MSHR_HIT is a special type of SECTOR_MISS - // so its already included in the SECTOR_MISS + // MSHR_HIT is a special type of SECTOR_MISS + // so its already included in the SECTOR_MISS total_access[type] += m_stats[type][status]; } } @@ -1335,10 +1339,10 @@ enum cache_request_status data_cache::wr_miss_wa_naive( assert(status == MISS); // SECTOR_MISS and HIT_RESERVED should not send write back mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr,m_wrbk_type, - mf->get_access_warp_mask(), evicted.m_byte_mask, - evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); + evicted.m_block_addr, m_wrbk_type, 
mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1388,10 +1392,10 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr,m_wrbk_type, - mf->get_access_warp_mask(), evicted.m_byte_mask, - evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1461,10 +1465,10 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr,m_wrbk_type, - mf->get_access_warp_mask(), evicted.m_byte_mask, - evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1514,7 +1518,6 
@@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); } - bool wb = false; evicted_block_info evicted; @@ -1538,7 +1541,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( } else { block->set_m_readable(false, mf->get_access_sector_mask()); if (m_status == HIT_RESERVED) - block->set_readable_on_fill(true, mf->get_access_sector_mask()); + block->set_readable_on_fill(true, mf->get_access_sector_mask()); } if (m_status != RESERVATION_FAIL) { @@ -1546,10 +1549,10 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr,m_wrbk_type, - mf->get_access_warp_mask(), evicted.m_byte_mask, - evicted.m_sector_mask, evicted.m_modified_size, - true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1596,7 +1599,7 @@ enum cache_request_status data_cache::rd_hit_base( m_tag_array->inc_dirty(); } block->set_status(MODIFIED, - mf->get_access_sector_mask()); // mark line as + mf->get_access_sector_mask()); // mark line as block->set_byte_mask(mf); } return HIT; @@ -1628,10 +1631,10 @@ enum cache_request_status data_cache::rd_miss_base( // (already modified lower level) if (wb && (m_config.m_write_policy != WRITE_THROUGH)) { mem_fetch *wb = m_memfetch_creator->alloc( - evicted.m_block_addr,m_wrbk_type, - mf->get_access_warp_mask(), evicted.m_byte_mask, - evicted.m_sector_mask, evicted.m_modified_size, - true, 
m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle,-1,-1,-1,NULL); + evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), + evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, + true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, + NULL); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 007403f5a..7a2a8d94d 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -84,7 +84,7 @@ struct evicted_block_info { m_block_addr = block_addr; m_modified_size = modified_size; } - void set_info(new_addr_type block_addr, unsigned modified_size, + void set_info(new_addr_type block_addr, unsigned modified_size, mem_access_byte_mask_t byte_mask, mem_access_sector_mask_t sector_mask) { m_block_addr = block_addr; @@ -121,8 +121,8 @@ struct cache_block_t { virtual void allocate(new_addr_type tag, new_addr_type block_addr, unsigned time, mem_access_sector_mask_t sector_mask) = 0; - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, - mem_access_byte_mask_t byte_mask) = 0; + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) = 0; virtual bool is_invalid_line() = 0; virtual bool is_valid_line() = 0; @@ -183,15 +183,14 @@ struct line_cache_block : public cache_block_t { m_set_readable_on_fill = false; m_set_byte_mask_on_fill = false; } - virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, - mem_access_byte_mask_t byte_mask) { + virtual void fill(unsigned time, mem_access_sector_mask_t sector_mask, + mem_access_byte_mask_t byte_mask) { // if(!m_ignore_on_fill_status) // assert( m_status == RESERVED ); m_status = m_set_modified_on_fill ? 
MODIFIED : VALID; - - if (m_set_readable_on_fill) - m_readable = true; + + if (m_set_readable_on_fill) m_readable = true; if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); m_fill_time = time; @@ -358,10 +357,10 @@ struct sector_cache_block : public cache_block_t { // if(!m_ignore_on_fill_status[sidx]) // assert( m_status[sidx] == RESERVED ); m_status[sidx] = m_set_modified_on_fill[sidx] ? MODIFIED : VALID; - + if (m_set_readable_on_fill[sidx]) { - m_readable[sidx] = true; - m_set_readable_on_fill[sidx] = false; + m_readable[sidx] = true; + m_set_readable_on_fill[sidx] = false; } if (m_set_byte_mask_on_fill) set_byte_mask(byte_mask); @@ -416,8 +415,7 @@ struct sector_cache_block : public cache_block_t { virtual mem_access_sector_mask_t get_dirty_sector_mask() { mem_access_sector_mask_t sector_mask; for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { - if (m_status[i] == MODIFIED) - sector_mask.set(i); + if (m_status[i] == MODIFIED) sector_mask.set(i); } return sector_mask; } @@ -575,7 +573,7 @@ class cache_config { } exit_parse_error(); } - + switch (ct) { case 'N': m_cache_type = NORMAL; @@ -631,18 +629,19 @@ class cache_config { if (m_alloc_policy == STREAMING) { /* For streaming cache: - (1) we set the alloc policy to be on-fill to remove all line_alloc_fail stalls. - if the whole memory is allocated to the L1 cache, then make the allocation to be on_MISS - otherwise, make it ON_FILL to eliminate line allocation fails. - i.e. MSHR throughput is the same, independent on the L1 cache size/associativity - So, we set the allocation policy per kernel basis, see shader.cc, max_cta() function - + (1) we set the alloc policy to be on-fill to remove all line_alloc_fail + stalls. if the whole memory is allocated to the L1 cache, then make the + allocation to be on_MISS otherwise, make it ON_FILL to eliminate line + allocation fails. i.e. 
MSHR throughput is the same, independent on the L1 + cache size/associativity So, we set the allocation policy per kernel + basis, see shader.cc, max_cta() function + (2) We also set the MSHRs to be equal to max allocated cache lines. This is possible by moving TAG to be shared between cache line and MSHR enrty (i.e. for each cache line, there is an MSHR rntey associated with it). This is the easiest think we can think of to model (mimic) L1 streaming cache in Pascal and Volta - + For more information about streaming cache, see: http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf https://ieeexplore.ieee.org/document/8344474/ @@ -697,8 +696,8 @@ class cache_config { } // detect invalid configuration - if ((m_alloc_policy == ON_FILL || m_alloc_policy == STREAMING) - and m_write_policy == WRITE_BACK) { + if ((m_alloc_policy == ON_FILL || m_alloc_policy == STREAMING) and + m_write_policy == WRITE_BACK) { // A writeback cache with allocate-on-fill policy will inevitably lead to // deadlock: The deadlock happens when an incoming cache-fill evicts a // dirty line, generating a writeback request. 
If the memory subsystem is @@ -746,7 +745,7 @@ class cache_config { break; case 'X': m_set_index_function = BITWISE_XORING_FUNCTION; - break; + break; default: exit_parse_error(); } @@ -779,7 +778,9 @@ class cache_config { virtual unsigned set_index(new_addr_type addr) const; - virtual unsigned get_max_cache_multiplier() const { return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER;} + virtual unsigned get_max_cache_multiplier() const { + return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; + } unsigned hash_function(new_addr_type addr, unsigned m_nset, unsigned m_line_sz_log2, unsigned m_nset_log2, @@ -826,9 +827,7 @@ class cache_config { write_allocate_policy_t get_write_allocate_policy() { return m_write_alloc_policy; } - write_policy_t get_write_policy() { - return m_write_policy; - } + write_policy_t get_write_policy() { return m_write_policy; } protected: void exit_parse_error() { @@ -903,17 +902,17 @@ class l1d_cache_config : public cache_config { unsigned l1_banks_byte_interleaving_log2; unsigned l1_banks_hashing_function; unsigned m_unified_cache_size; - virtual unsigned get_max_cache_multiplier() const { - // set * assoc * cacheline size. Then convert Byte to KB - // gpgpu_unified_cache_size is in KB while original_sz is in B - if (m_unified_cache_size > 0) { - unsigned original_size = m_nset * original_m_assoc * m_line_sz / 1024; - assert(m_unified_cache_size % original_size == 0); - return m_unified_cache_size / original_size; - } else { - return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; - } + virtual unsigned get_max_cache_multiplier() const { + // set * assoc * cacheline size. 
Then convert Byte to KB + // gpgpu_unified_cache_size is in KB while original_sz is in B + if (m_unified_cache_size > 0) { + unsigned original_size = m_nset * original_m_assoc * m_line_sz / 1024; + assert(m_unified_cache_size % original_size == 0); + return m_unified_cache_size / original_size; + } else { + return MAX_DEFAULT_CACHE_SIZE_MULTIBLIER; } + } }; class l2_cache_config : public cache_config { @@ -936,8 +935,7 @@ class tag_array { mem_fetch *mf, bool is_write, bool probe_mode = false) const; enum cache_request_status probe(new_addr_type addr, unsigned &idx, - mem_access_sector_mask_t mask, - bool is_write, + mem_access_sector_mask_t mask, bool is_write, bool probe_mode = false, mem_fetch *mf = NULL) const; enum cache_request_status access(new_addr_type addr, unsigned time, @@ -948,7 +946,7 @@ class tag_array { void fill(new_addr_type addr, unsigned time, mem_fetch *mf, bool is_write); void fill(unsigned idx, unsigned time, mem_fetch *mf); - void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask, + void fill(new_addr_type addr, unsigned time, mem_access_sector_mask_t mask, mem_access_byte_mask_t byte_mask, bool is_write); unsigned size() const { return m_config.get_num_lines(); } @@ -967,9 +965,7 @@ class tag_array { void update_cache_parameters(cache_config &config); void add_pending_line(mem_fetch *mf); void remove_pending_line(mem_fetch *mf); - void inc_dirty() { - m_dirty++; - } + void inc_dirty() { m_dirty++; } protected: // This constructor is intended for use only from derived classes that wish to diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index df3004772..56ede056c 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -249,7 +249,8 @@ void shader_core_config::reg_options(class OptionParser *opp) { " {::,:::,::, | none}", "none"); - option_parser_register(opp,"-gpgpu_l1_cache_write_ratio",OPT_UINT32,&m_L1D_config.m_wr_percent,"L1D write ratio","0"); + option_parser_register(opp, 
"-gpgpu_l1_cache_write_ratio", OPT_UINT32, + &m_L1D_config.m_wr_percent, "L1D write ratio", "0"); option_parser_register(opp, "-gpgpu_l1_banks", OPT_UINT32, &m_L1D_config.l1_banks, "The number of L1 cache banks", "1"); @@ -327,11 +328,12 @@ void shader_core_config::reg_options(class OptionParser *opp) { option_parser_register( opp, "-gpgpu_shmem_size", OPT_UINT32, &gpgpu_shmem_size, "Size of shared memory per shader core (default 16kB)", "16384"); + option_parser_register(opp, "-gpgpu_shmem_option", OPT_CSTR, + &gpgpu_shmem_option, + "Option list of shared memory sizes", "0"); option_parser_register( - opp, "-gpgpu_shmem_option", OPT_CSTR, &gpgpu_shmem_option, - "Option list of shared memory sizes", "0"); - option_parser_register( - opp, "-gpgpu_unified_l1d_size", OPT_UINT32, &m_L1D_config.m_unified_cache_size, + opp, "-gpgpu_unified_l1d_size", OPT_UINT32, + &m_L1D_config.m_unified_cache_size, "Size of unified data cache(L1D + shared memory) in KB", "0"); option_parser_register(opp, "-gpgpu_adaptive_cache_config", OPT_BOOL, &adaptive_cache_config, "adaptive_cache_config", "0"); diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 0db6bd44c..57e8ea97c 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -57,20 +57,18 @@ mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, return mf; } -mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle, - unsigned wid, unsigned sid, - unsigned tpc, mem_fetch *original_mf) const { - mem_access_t access(type, addr, size, wr, active_mask, byte_mask, - sector_mask, m_memory_config->gpgpu_ctx); +mem_fetch *partition_mf_allocator::alloc( + new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t 
§or_mask, unsigned size, bool wr, + unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, + m_memory_config->gpgpu_ctx); mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, - sid, tpc, m_memory_config, cycle,original_mf); - return mf; + new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, + wid, sid, tpc, m_memory_config, cycle, original_mf); + return mf; } memory_partition_unit::memory_partition_unit(unsigned partition_id, const memory_config *config, @@ -725,11 +723,12 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr() + SECTOR_SIZE * i, - mf->get_access_type(),mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask,std::bitset().set(i), - SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr() + SECTOR_SIZE * i, mf->get_access_type(), + mf->get_access_warp_mask(), mf->get_access_byte_mask() & mask, + std::bitset().set(i), SECTOR_SIZE, mf->is_write(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf); result.push_back(n_mf); } @@ -746,11 +745,12 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr(), - mf->get_access_type(),mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask,std::bitset().set(i), - SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); + mem_fetch *n_mf = 
m_mf_allocator->alloc( + mf->get_addr(), mf->get_access_type(), mf->get_access_warp_mask(), + mf->get_access_byte_mask() & mask, + std::bitset().set(i), SECTOR_SIZE, mf->is_write(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf); result.push_back(n_mf); } @@ -761,11 +761,12 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { mask.set(k); } - mem_fetch *n_mf = m_mf_allocator->alloc(mf->get_addr() + SECTOR_SIZE * i, - mf->get_access_type(),mf->get_access_warp_mask(), - mf->get_access_byte_mask() & mask,std::bitset().set(i), - SECTOR_SIZE,mf->is_write(),m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(),mf->get_sid(), mf->get_tpc(),mf); + mem_fetch *n_mf = m_mf_allocator->alloc( + mf->get_addr() + SECTOR_SIZE * i, mf->get_access_type(), + mf->get_access_warp_mask(), mf->get_access_byte_mask() & mask, + std::bitset().set(i), SECTOR_SIZE, + mf->is_write(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); result.push_back(n_mf); } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 59432b88d..beed76562 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -52,13 +52,12 @@ class partition_mf_allocator : public mem_fetch_allocator { unsigned size, bool wr, unsigned long long cycle) const; virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle, - unsigned wid, unsigned sid, - unsigned tpc, mem_fetch *original_mf) const; + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, + unsigned size, bool wr, unsigned long long cycle, + unsigned wid, unsigned sid, unsigned tpc, + mem_fetch 
*original_mf) const; private: const memory_config *m_memory_config; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 51366deb4..c65affdb6 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -62,21 +62,19 @@ mem_fetch *shader_core_mem_fetch_allocator::alloc( return mf; } -mem_fetch *shader_core_mem_fetch_allocator::alloc(new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle, - unsigned wid, unsigned sid, - unsigned tpc, mem_fetch *original_mf) const { - mem_access_t access(type, addr, size, wr, active_mask, byte_mask, - sector_mask, m_memory_config->gpgpu_ctx); - mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, - m_core_id, m_cluster_id, m_memory_config, cycle,original_mf); - return mf; - } +mem_fetch *shader_core_mem_fetch_allocator::alloc( + new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, + unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, + mem_fetch *original_mf) const { + mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, + m_memory_config->gpgpu_ctx); + mem_fetch *mf = new mem_fetch( + access, NULL, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, m_core_id, + m_cluster_id, m_memory_config, cycle, original_mf); + return mf; +} ///////////////////////////////////////////////////////////////////////////// std::list shader_core_ctx::get_regs_written(const inst_t &fvt) const { @@ -142,8 +140,8 @@ void shader_core_ctx::create_front_pipeline() { m_pipeline_reg[ID_OC_INT].get_size()); for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { if (m_config->m_specialized_unit[j].num_units > 0) - assert(m_config->gpgpu_num_sched_per_core == - m_config->m_specialized_unit[j].id_oc_spec_reg_width); + assert(m_config->gpgpu_num_sched_per_core == + m_config->m_specialized_unit[j].id_oc_spec_reg_width); } } @@ -187,15 +185,18 @@ void shader_core_ctx::create_schedulers() { // must currently occur after all inputs have been initialized. std::string sched_config = m_config->gpgpu_scheduler_string; const concrete_scheduler scheduler = - sched_config.find("lrr") != std::string::npos ? CONCRETE_SCHEDULER_LRR - : sched_config.find("two_level_active") != std::string::npos - ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE - : sched_config.find("gto") != std::string::npos ? CONCRETE_SCHEDULER_GTO - : sched_config.find("old") != std::string::npos - ? CONCRETE_SCHEDULER_OLDEST_FIRST - : sched_config.find("warp_limiting") != std::string::npos - ? CONCRETE_SCHEDULER_WARP_LIMITING - : NUM_CONCRETE_SCHEDULERS; + sched_config.find("lrr") != std::string::npos + ? CONCRETE_SCHEDULER_LRR + : sched_config.find("two_level_active") != std::string::npos + ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE + : sched_config.find("gto") != std::string::npos + ? CONCRETE_SCHEDULER_GTO + : sched_config.find("old") != std::string::npos + ? CONCRETE_SCHEDULER_OLDEST_FIRST + : sched_config.find("warp_limiting") != + std::string::npos + ? 
CONCRETE_SCHEDULER_WARP_LIMITING + : NUM_CONCRETE_SCHEDULERS; assert(scheduler != NUM_CONCRETE_SCHEDULERS); for (unsigned i = 0; i < m_config->gpgpu_num_sched_per_core; i++) { @@ -1246,20 +1247,21 @@ void scheduler_unit::cycle() { previous_issued_inst_exec_type = exec_unit_type_t::MEM; } } else { - // This code need to be refactored if (pI->op != TENSOR_CORE_OP && pI->op != SFU_OP && pI->op != DP_OP && !(pI->op >= SPEC_UNIT_START_ID)) { bool execute_on_SP = false; bool execute_on_INT = false; - bool sp_pipe_avail = - (m_shader->m_config->gpgpu_num_sp_units > 0) && - m_sp_out->has_free(m_shader->m_config->sub_core_model, m_id); - bool int_pipe_avail = - (m_shader->m_config->gpgpu_num_int_units > 0) && - m_int_out->has_free(m_shader->m_config->sub_core_model, m_id); - + bool sp_pipe_avail = + (m_shader->m_config->gpgpu_num_sp_units > 0) && + m_sp_out->has_free(m_shader->m_config->sub_core_model, + m_id); + bool int_pipe_avail = + (m_shader->m_config->gpgpu_num_int_units > 0) && + m_int_out->has_free(m_shader->m_config->sub_core_model, + m_id); + // if INT unit pipline exist, then execute ALU and INT // operations on INT unit and SP-FPU on SP unit (like in Volta) // if INT unit pipline does not exist, then execute all ALU, INT @@ -1320,10 +1322,10 @@ void scheduler_unit::cycle() { (pI->op == DP_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::DP)) { - bool dp_pipe_avail = - (m_shader->m_config->gpgpu_num_dp_units > 0) && - m_dp_out->has_free(m_shader->m_config->sub_core_model, m_id); + (m_shader->m_config->gpgpu_num_dp_units > 0) && + m_dp_out->has_free(m_shader->m_config->sub_core_model, + m_id); if (dp_pipe_avail) { m_shader->issue_warp(*m_dp_out, pI, active_mask, warp_id, @@ -1340,10 +1342,10 @@ void scheduler_unit::cycle() { (pI->op == SFU_OP) || (pI->op == ALU_SFU_OP)) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::SFU)) { - bool sfu_pipe_avail = - (m_shader->m_config->gpgpu_num_sfu_units > 0) && - 
m_sfu_out->has_free(m_shader->m_config->sub_core_model, m_id); + (m_shader->m_config->gpgpu_num_sfu_units > 0) && + m_sfu_out->has_free(m_shader->m_config->sub_core_model, + m_id); if (sfu_pipe_avail) { m_shader->issue_warp(*m_sfu_out, pI, active_mask, warp_id, @@ -1356,11 +1358,10 @@ void scheduler_unit::cycle() { } else if ((pI->op == TENSOR_CORE_OP) && !(diff_exec_units && previous_issued_inst_exec_type == exec_unit_type_t::TENSOR)) { - bool tensor_core_pipe_avail = - (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && - m_tensor_core_out->has_free( - m_shader->m_config->sub_core_model, m_id); + (m_shader->m_config->gpgpu_num_tensor_core_units > 0) && + m_tensor_core_out->has_free( + m_shader->m_config->sub_core_model, m_id); if (tensor_core_pipe_avail) { m_shader->issue_warp(*m_tensor_core_out, pI, active_mask, @@ -2007,8 +2008,10 @@ void ldst_unit::L1_latency_queue_cycle() { l1_latency_queue[j][0] = NULL; if (m_config->m_L1D_config.get_write_policy() != WRITE_THROUGH && mf_next->get_inst().is_store() && - (m_config->m_L1D_config.get_write_allocate_policy() == FETCH_ON_WRITE || - m_config->m_L1D_config.get_write_allocate_policy() == LAZY_FETCH_ON_READ) && + (m_config->m_L1D_config.get_write_allocate_policy() == + FETCH_ON_WRITE || + m_config->m_L1D_config.get_write_allocate_policy() == + LAZY_FETCH_ON_READ) && !was_writeallocate_sent(events)) { unsigned dec_ack = (m_config->m_L1D_config.get_mshr_type() == SECTOR_ASSOC) @@ -2316,7 +2319,7 @@ void dp_unit ::issue(register_set &source_reg) { } void specialized_unit ::issue(register_set &source_reg) { - warp_inst_t **ready_reg = + warp_inst_t **ready_reg = source_reg.get_ready(m_config->sub_core_model, m_issue_reg_id); // m_core->incexecstat((*ready_reg)); (*ready_reg)->op_pipe = SPECIALIZED__OP; @@ -3349,15 +3352,15 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { unsigned max_assoc = m_L1D_config.get_max_assoc(); for (std::vector::const_iterator it = shmem_opt_list.begin(); - it < 
shmem_opt_list.end(); it++) { + it < shmem_opt_list.end(); it++) { if (total_shmem <= *it) { - float l1_ratio = 1 - ((float) *(it) / total_unified); + float l1_ratio = 1 - ((float)*(it) / total_unified); m_L1D_config.set_assoc(max_assoc * l1_ratio); l1d_configured = true; break; } } - + assert(l1d_configured && "no shared memory option found"); break; } @@ -3365,16 +3368,16 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { assert(0); } - if(m_L1D_config.is_streaming()) { - //for streaming cache, if the whole memory is allocated - //to the L1 cache, then make the allocation to be on_MISS - //otherwise, make it ON_FILL to eliminate line allocation fails - //i.e. MSHR throughput is the same, independent on the L1 cache size/associativity - if(total_shmem == 0) { + if (m_L1D_config.is_streaming()) { + // for streaming cache, if the whole memory is allocated + // to the L1 cache, then make the allocation to be on_MISS + // otherwise, make it ON_FILL to eliminate line allocation fails + // i.e. 
MSHR throughput is the same, independent on the L1 cache + // size/associativity + if (total_shmem == 0) { m_L1D_config.set_allocation_policy(ON_MISS); printf("GPGPU-Sim: Reconfigure L1 allocation to ON_MISS\n"); - } - else { + } else { m_L1D_config.set_allocation_policy(ON_FILL); printf("GPGPU-Sim: Reconfigure L1 allocation to ON_FILL\n"); } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 866231357..2d2f051b5 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1496,8 +1496,8 @@ class shader_core_config : public core_config { break; // we only accept continuous specialized_units, i.e., 1,2,3,4 } - //parse gpgpu_shmem_option for adpative cache config - if(adaptive_cache_config) { + // parse gpgpu_shmem_option for adpative cache config + if (adaptive_cache_config) { for (unsigned i = 0; i < strlen(gpgpu_shmem_option); i++) { char option[4]; int j = 0; @@ -1520,7 +1520,6 @@ class shader_core_config : public core_config { } std::sort(shmem_opt_list.begin(), shmem_opt_list.end()); } - } void reg_options(class OptionParser *opp); unsigned max_cta(const kernel_info_t &k) const; @@ -1899,13 +1898,11 @@ class shader_core_mem_fetch_allocator : public mem_fetch_allocator { mem_fetch *alloc(new_addr_type addr, mem_access_type type, unsigned size, bool wr, unsigned long long cycle) const; mem_fetch *alloc(new_addr_type addr, mem_access_type type, - const active_mask_t &active_mask, - const mem_access_byte_mask_t &byte_mask, - const mem_access_sector_mask_t §or_mask, - unsigned size, bool wr, - unsigned long long cycle, - unsigned wid, unsigned sid, - unsigned tpc, mem_fetch *original_mf) const; + const active_mask_t &active_mask, + const mem_access_byte_mask_t &byte_mask, + const mem_access_sector_mask_t §or_mask, unsigned size, + bool wr, unsigned long long cycle, unsigned wid, + unsigned sid, unsigned tpc, mem_fetch *original_mf) const; mem_fetch *alloc(const warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const { 
warp_inst_t inst_copy = inst; From 778962ed40707369c97a03a3864cc1ee6c7470b6 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. Abdallah" Date: Wed, 26 May 2021 16:37:39 -0400 Subject: [PATCH 080/154] updating the configs based on the tuner output --- .../tested-cfgs/SM75_RTX2060/gpgpusim.config | 118 ++++++++---------- .../tested-cfgs/SM86_RTX3070/gpgpusim.config | 108 +++++++--------- 2 files changed, 100 insertions(+), 126 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index f715f3aa4..f35af1b64 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -1,8 +1,3 @@ -# This config models the Turing RTX 2060 -# For more info about turing architecture: -# 1- https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf -# 2- "RTX on—The NVIDIA Turing GPU", IEEE MICRO 2020 - # functional simulator specification -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 @@ -13,7 +8,8 @@ -gpgpu_heap_size_limit 8388608 -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 --gpgpu_kernel_launch_latency 5000 +-gpgpu_kernel_launch_latency 7571 +-gpgpu_TB_launch_latency 0 # Compute Capability -gpgpu_compute_capability_major 7 @@ -27,31 +23,27 @@ -gpgpu_n_clusters 30 -gpgpu_n_cores_per_cluster 1 -gpgpu_n_mem 12 --gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_n_sub_partition_per_mchannel 2 -# volta clock domains +# clock domains #-gpgpu_clock_domains ::: --gpgpu_clock_domains 1365.0:1365.0:1365.0:3500.0 -# boost mode -# -gpgpu_clock_domains 1680.0:1680.0:1680.0:3500.0 +-gpgpu_clock_domains 1365:1365:1365:3500.5 # shader core pipeline config -gpgpu_shader_registers 65536 -gpgpu_registers_per_block 65536 -gpgpu_occupancy_sm_number 75 -# This implies a maximum of 32 warps/SM --gpgpu_shader_core_pipeline 1024:32 --gpgpu_shader_cta 
32 +-gpgpu_shader_core_pipeline 1024:32 +-gpgpu_shader_cta 16 -gpgpu_simd_model 1 # Pipeline widths and number of FUs # ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE -## Turing has 4 SP SIMD units, 4 INT units, 4 SFU units, 8 Tensor core units -## We need to scale the number of pipeline registers to be equal to the number of SP units --gpgpu_pipeline_widths 4,0,4,4,4,4,0,4,4,4,8,4,4 +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 -gpgpu_num_sp_units 4 -gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 -gpgpu_num_int_units 4 -gpgpu_tensor_core_avail 1 -gpgpu_num_tensor_core_units 4 @@ -59,32 +51,18 @@ # Instruction latencies and initiation intervals # "ADD,MAX,MUL,MAD,DIV" # All Div operations are executed on SFU unit --ptx_opcode_latency_int 4,13,4,5,145,32 --ptx_opcode_initiation_int 2,2,2,2,8,4 --ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_latency_int 4,4,4,4,21 +-ptx_opcode_initiation_int 2,2,2,2,2 +-ptx_opcode_latency_fp 4,4,4,4,39 -ptx_opcode_initiation_fp 2,2,2,2,4 --ptx_opcode_latency_dp 8,19,8,8,330 --ptx_opcode_initiation_dp 4,4,4,4,130 --ptx_opcode_latency_sfu 100 +-ptx_opcode_latency_dp 54,54,54,54,330 +-ptx_opcode_initiation_dp 64,64,64,64,130 +-ptx_opcode_latency_sfu 21 -ptx_opcode_initiation_sfu 8 -ptx_opcode_latency_tesnor 64 -ptx_opcode_initiation_tensor 64 -# Turing has four schedulers per core --gpgpu_num_sched_per_core 4 -# Greedy then oldest scheduler --gpgpu_scheduler gto -## In Turing, a warp scheduler can issue 1 inst per cycle --gpgpu_max_insn_issue_per_warp 1 --gpgpu_dual_issue_diff_exec_units 1 - -# shared memory bankconflict detection --gpgpu_shmem_num_banks 32 --gpgpu_shmem_limited_broadcast 0 --gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 75 - -# Trung has sub core model, in which each scheduler has its own register file and EUs +# sub core model: in which each scheduler has its own register file and EUs # i.e. 
schedulers are isolated -gpgpu_sub_core_model 1 # disable specialized operand collectors and use generic operand collectors instead @@ -92,31 +70,46 @@ -gpgpu_operand_collector_num_units_gen 8 -gpgpu_operand_collector_num_in_ports_gen 8 -gpgpu_operand_collector_num_out_ports_gen 8 -# turing has 8 banks dual-port, 4 schedulers, two banks per scheduler -# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version --gpgpu_num_reg_banks 16 +# register banks +-gpgpu_num_reg_banks 8 -gpgpu_reg_file_port_throughput 2 +# warp scheduling +-gpgpu_num_sched_per_core 4 +-gpgpu_scheduler gto +# a warp scheduler issue mode +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo +# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x -gpgpu_adaptive_cache_config 1 --gpgpu_shmem_option 32,64 --gpgpu_unified_l1d_size 96 +-gpgpu_shmem_option 0,8,16,32,64,64 +-gpgpu_unified_l1d_size 64 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 --gpgpu_l1_cache_write_ratio 25 --gpgpu_l1_latency 20 +-gpgpu_cache:dl1 S:4:128:128,L:T:m:L:L,A:256:32,16:0,32 +-gpgpu_l1_latency 32 -gpgpu_gmem_skip_L1D 0 --gpgpu_n_cluster_ejection_buffer_size 32 -gpgpu_flush_l1_cache 1 -# shared memory configuration +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_cache_write_ratio 25 + +# shared memory configuration -gpgpu_shmem_size 65536 -gpgpu_shmem_sizeDefault 65536 --gpgpu_shmem_per_block 65536 --gpgpu_smem_latency 20 +-gpgpu_shmem_per_block 49152 +-gpgpu_smem_latency 30 +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 75 -# 64 sets, 
each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 3MB L2 cache +# L2 cache -gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 @@ -127,34 +120,31 @@ -gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 -gpgpu_inst_fetch_throughput 4 # 128 KB Tex -# Note, TEX is deprected in Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod +# Note, TEX is deprected since Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod -gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 # 64 KB Const -gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 -gpgpu_perfect_inst_const_cache 1 # interconnection -#-network_mode 1 -#-inter_config_file config_turing_islip.icnt # use built-in local xbar -network_mode 2 -icnt_in_buffer_limit 512 -icnt_out_buffer_limit 512 -icnt_subnets 2 --icnt_arbiter_algo 1 -icnt_flit_size 40 +-icnt_arbiter_algo 1 # memory partition latency config --gpgpu_l2_rop_latency 160 --dram_latency 100 +-gpgpu_l2_rop_latency 194 +-dram_latency 96 -# dram model config +# dram sched config -gpgpu_dram_scheduler 1 -gpgpu_frfcfs_dram_sched_queue_size 64 -gpgpu_dram_return_queue_size 192 -# Turing has GDDR6 -# http://monitorinsider.com/GDDR6.html +# dram model config -gpgpu_n_mem_per_ctrlr 1 -gpgpu_dram_buswidth 2 -gpgpu_dram_burst_length 16 @@ -162,9 +152,9 @@ -gpgpu_mem_address_mask 1 -gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS -# Use the same GDDR5 timing, scaled to 3500MHZ --gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62: - CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4" +# Mem timing +-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4 +-dram_dual_bus_interface 0 # select lower bits for bnkgrp to increase bnkgrp parallelism 
-dram_bnk_indexing_policy 0 @@ -179,7 +169,7 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Volta +# power model configs, disable it untill we create a real energy model -power_simulation_enabled 0 # tracing functionality diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 02cdb9ec7..a68703f09 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -1,19 +1,14 @@ -# This config models the Ampere RTX 3070 -# For more info about Ampere architecture: -# https://developer.download.nvidia.com/video/gputechconf/gtc/2020/presentations/s21730-inside-the-nvidia-ampere-architecture.pdf -# https://www.nvidia.com/content/dam/en-zz/Solutions/geforce/ampere/pdf/NVIDIA-ampere-GA102-GPU-Architecture-Whitepaper-V1.pdf -# https://en.wikipedia.org/wiki/GeForce_30_series # functional simulator specification -gpgpu_ptx_instruction_classification 0 -gpgpu_ptx_sim_mode 0 --gpgpu_ptx_force_max_capability 86 +-gpgpu_ptx_force_max_capability 86 # Device Limits -gpgpu_stack_size_limit 1024 -gpgpu_heap_size_limit 8388608 -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 --gpgpu_kernel_launch_latency 5000 +-gpgpu_kernel_launch_latency 7872 -gpgpu_TB_launch_latency 0 # Compute Capability @@ -30,26 +25,21 @@ -gpgpu_n_mem 16 -gpgpu_n_sub_partition_per_mchannel 2 -# Ampere clock domains +# clock domains #-gpgpu_clock_domains ::: --gpgpu_clock_domains 1320.0:1320.0:1320.0:3500.0 -# boost mode -# -gpgpu_clock_domains 1780.0:1780.0:1780.0:3500.0 +-gpgpu_clock_domains 1132:1132:1132:3500.5 # shader core pipeline config -gpgpu_shader_registers 65536 -gpgpu_registers_per_block 65536 -gpgpu_occupancy_sm_number 86 -# This implies a maximum of 64 warps/SM --gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_core_pipeline 1536:32 -gpgpu_shader_cta 32 
-gpgpu_simd_model 1 # Pipeline widths and number of FUs # ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE -## Ampere GA102 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units -## we need to scale the number of pipeline registers to be equal to the number of SP units -gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 -gpgpu_num_sp_units 4 -gpgpu_num_sfu_units 4 @@ -61,18 +51,18 @@ # Instruction latencies and initiation intervals # "ADD,MAX,MUL,MAD,DIV" # All Div operations are executed on SFU unit --ptx_opcode_latency_int 4,13,4,5,145,21 --ptx_opcode_initiation_int 2,2,2,2,8,4 --ptx_opcode_latency_fp 4,13,4,5,39 --ptx_opcode_initiation_fp 2,2,2,2,4 --ptx_opcode_latency_dp 8,19,8,8,330 --ptx_opcode_initiation_dp 4,4,4,4,130 --ptx_opcode_latency_sfu 100 +-ptx_opcode_latency_int 4,4,4,4,21 +-ptx_opcode_initiation_int 2,2,2,2,2 +-ptx_opcode_latency_fp 4,4,4,4,39 +-ptx_opcode_initiation_fp 1,1,1,1,2 +-ptx_opcode_latency_dp 55,55,55,55,330 +-ptx_opcode_initiation_dp 64,64,64,64,130 +-ptx_opcode_latency_sfu 21 -ptx_opcode_initiation_sfu 8 --ptx_opcode_latency_tesnor 32 --ptx_opcode_initiation_tensor 32 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 -# Ampere has sub core model, in which each scheduler has its own register file and EUs +# sub core model: in which each scheduler has its own register file and EUs # i.e. 
schedulers are isolated -gpgpu_sub_core_model 1 # disable specialized operand collectors and use generic operand collectors instead @@ -80,50 +70,47 @@ -gpgpu_operand_collector_num_units_gen 8 -gpgpu_operand_collector_num_in_ports_gen 8 -gpgpu_operand_collector_num_out_ports_gen 8 -# Ampere has 24 double-ported banks, 4 schedulers, 6 banks per scheduler --gpgpu_num_reg_banks 24 +# register banks +-gpgpu_num_reg_banks 8 -gpgpu_reg_file_port_throughput 2 -# shared memory bankconflict detection --gpgpu_shmem_num_banks 32 --gpgpu_shmem_limited_broadcast 0 --gpgpu_shmem_warp_parts 1 --gpgpu_coalesce_arch 86 - -# Ampere has four schedulers per core +# warp scheduling -gpgpu_num_sched_per_core 4 -# Greedy then oldest scheduler -gpgpu_scheduler gto -## In Ampere, a warp scheduler can issue 1 inst per cycle +# a warp scheduler issue mode -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration # ::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo -# In Ampere, we assign the remaining shared memory to L1 cache -# if the assigned shd mem = 0, then L1 cache = 128KB -# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-8-x -# disable this mode in case of multi kernels/apps execution +# In adaptive cache, we adaptively assign the remaining shared memory to L1 cache +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x -gpgpu_adaptive_cache_config 1 -gpgpu_shmem_option 0,8,16,32,64,100 -gpgpu_unified_l1d_size 128 -# Ampere unified cache has four banks +# L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 --gpgpu_l1_cache_write_ratio 25 +-gpgpu_cache:dl1 S:4:128:256,L:T:m:L:L,A:384:48,16:0,32 +-gpgpu_l1_latency 39 -gpgpu_gmem_skip_L1D 0 --gpgpu_l1_latency 20 --gpgpu_n_cluster_ejection_buffer_size 32 -gpgpu_flush_l1_cache 1 -# shared memory configuration 
+-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_cache_write_ratio 25 + +# shared memory configuration -gpgpu_shmem_size 102400 -gpgpu_shmem_sizeDefault 102400 --gpgpu_shmem_per_block 102400 --gpgpu_smem_latency 20 +-gpgpu_shmem_per_block 49152 +-gpgpu_smem_latency 29 +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 86 -# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub partition). This gives us 3MB L2 cache --gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +# L2 cache +-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 -gpgpu_cache:dl2_texture_only 0 -gpgpu_dram_partition_queues 64:64:64:64 -gpgpu_perf_sim_memcpy 1 @@ -133,15 +120,13 @@ -gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 -gpgpu_inst_fetch_throughput 4 # 128 KB Tex -# Note, TEX is deprecated, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg mehtod +# Note, TEX is deprected since Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod -gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 # 64 KB Const -gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 -gpgpu_perfect_inst_const_cache 1 # interconnection -#-network_mode 1 -#-inter_config_file config_ampere_islip.icnt # use built-in local xbar -network_mode 2 -icnt_in_buffer_limit 512 @@ -151,16 +136,15 @@ -icnt_arbiter_algo 1 # memory partition latency config --gpgpu_l2_rop_latency 160 --dram_latency 100 +-gpgpu_l2_rop_latency 187 +-dram_latency 254 -# dram model config +# dram sched config -gpgpu_dram_scheduler 1 -gpgpu_frfcfs_dram_sched_queue_size 64 -gpgpu_dram_return_queue_size 192 -# Ampere RTX3060 has GDDR6 -# http://monitorinsider.com/GDDR6.html +# dram model config -gpgpu_n_mem_per_ctrlr 1 -gpgpu_dram_buswidth 2 -gpgpu_dram_burst_length 16 @@ -168,9 +152,9 @@ -gpgpu_mem_address_mask 1 -gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS -# Use the same GDDR5 timing, scaled to 3500MHZ --gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62: - CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4" +# Mem timing +-gpgpu_dram_timing_opt nbk=16:CCD=4:RRD=12:RCD=24:RAS=55:RP=24:RC=78:CL=24:WL=8:CDLR=10:WR=24:nbkgrp=4:CCDL=6:RTPL=4 +-dram_dual_bus_interface 0 # select lower bits for bnkgrp to increase bnkgrp parallelism -dram_bnk_indexing_policy 0 @@ -185,7 +169,7 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Ampere +# power model configs, disable it untill we create a real energy model -power_simulation_enabled 0 # tracing functionality From 3eea0140bc19dc1822d40e29d1aa55643894c6d3 Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Wed, 26 May 2021 19:44:28 -0400 Subject: [PATCH 081/154] changing kernel latency --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index f35af1b64..a9943703a 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -8,7 +8,7 @@ -gpgpu_heap_size_limit 8388608 -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 --gpgpu_kernel_launch_latency 7571 +-gpgpu_kernel_launch_latency 5000 -gpgpu_TB_launch_latency 0 # Compute Capability diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index a68703f09..fda3851d0 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -8,7 +8,7 @@ -gpgpu_heap_size_limit 8388608 -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 --gpgpu_kernel_launch_latency 7872 +-gpgpu_kernel_launch_latency 5000 -gpgpu_TB_launch_latency 0 # Compute Capability From 6ad461a95ac71e0597274c4f750ce03bb3a6871e Mon Sep 17 00:00:00 2001 From: "Mahmoud Khairy A. 
Abdallah" Date: Thu, 27 May 2021 15:38:26 -0400 Subject: [PATCH 082/154] fixing configs --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 8 ++++---- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index a9943703a..cc3152c59 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -55,7 +55,7 @@ -ptx_opcode_initiation_int 2,2,2,2,2 -ptx_opcode_latency_fp 4,4,4,4,39 -ptx_opcode_initiation_fp 2,2,2,2,4 --ptx_opcode_latency_dp 54,54,54,54,330 +-ptx_opcode_latency_dp 64,64,64,64,330 -ptx_opcode_initiation_dp 64,64,64,64,130 -ptx_opcode_latency_sfu 21 -ptx_opcode_initiation_sfu 8 @@ -87,11 +87,11 @@ # In adaptive cache, we adaptively assign the remaining shared memory to L1 cache # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x -gpgpu_adaptive_cache_config 1 --gpgpu_shmem_option 0,8,16,32,64,64 --gpgpu_unified_l1d_size 64 +-gpgpu_shmem_option 32,64 +-gpgpu_unified_l1d_size 96 # L1 cache configuration -gpgpu_l1_banks 4 --gpgpu_cache:dl1 S:4:128:128,L:T:m:L:L,A:256:32,16:0,32 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:256:32,16:0,32 -gpgpu_l1_latency 32 -gpgpu_gmem_skip_L1D 0 -gpgpu_flush_l1_cache 1 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index fda3851d0..098cb1d20 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -55,7 +55,7 @@ -ptx_opcode_initiation_int 2,2,2,2,2 -ptx_opcode_latency_fp 4,4,4,4,39 -ptx_opcode_initiation_fp 1,1,1,1,2 --ptx_opcode_latency_dp 55,55,55,55,330 +-ptx_opcode_latency_dp 64,64,64,64,330 -ptx_opcode_initiation_dp 64,64,64,64,130 -ptx_opcode_latency_sfu 21 -ptx_opcode_initiation_sfu 8 From 
110aeb12257b030b32cdc47e4cca0ed1089ac855 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 31 May 2021 15:55:18 -0400 Subject: [PATCH 083/154] rewrite shmem_option parsing --- src/gpgpu-sim/shader.h | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 2d2f051b5..4c6de0683 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1498,25 +1498,11 @@ class shader_core_config : public core_config { // parse gpgpu_shmem_option for adpative cache config if (adaptive_cache_config) { - for (unsigned i = 0; i < strlen(gpgpu_shmem_option); i++) { - char option[4]; - int j = 0; - while (gpgpu_shmem_option[i] != ',' && i < strlen(gpgpu_shmem_option)) { - if (gpgpu_shmem_option[i] == ' ') { - // skip spaces - i++; - } else { - if (!isdigit(gpgpu_shmem_option[i])) { - // check for non digits, which should not be here - assert(0 && "invalid config: -gpgpu_shmem_option"); - } - option[j] = gpgpu_shmem_option[i]; - j++; - i++; - } - } - // convert KB -> B - shmem_opt_list.push_back((unsigned)atoi(option) * 1024); + std::stringstream ss(gpgpu_shmem_option); + while (ss.good()) { + std::string option; + std::getline(ss, option, ','); + shmem_opt_list.push_back((unsigned)std::stoi(option) * 1024); } std::sort(shmem_opt_list.begin(), shmem_opt_list.end()); } From 04462cbf5b56e0416c3a733b4214351ac227f4c0 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Thu, 3 Jun 2021 13:55:44 -0400 Subject: [PATCH 084/154] update readable --- src/gpgpu-sim/gpu-cache.cc | 30 +++++++++++++++++++++++++++--- src/gpgpu-sim/gpu-cache.h | 2 +- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 28d3215ae..a35f5022d 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -284,10 +284,12 @@ enum cache_request_status 
tag_array::probe(new_addr_type addr, unsigned &idx, // number of dirty lines / total lines in the cache float dirty_line_percentage = ((float)m_dirty / (m_config.m_nset * m_config.m_assoc)) * 100; + // If the cacheline is from a load op (not modified), + // or the total dirty cacheline is above a specific value, + // Then this cacheline is eligible to be considered for replacement candidate + // i.e. Only evict clean cachelines until total dirty cachelines reach the limit. if (!line->is_modified_line() || dirty_line_percentage >= m_config.m_wr_percent) { - // if number of dirty lines in the cache is greater than - // a specific value all_reserved = false; if (line->is_invalid_line()) { invalid_line = index; @@ -354,7 +356,7 @@ enum cache_request_status tag_array::access(new_addr_type addr, unsigned time, if (m_config.m_alloc_policy == ON_MISS) { if (m_lines[idx]->is_modified_line()) { wb = true; - m_lines[idx]->set_byte_mask(mf); + // m_lines[idx]->set_byte_mask(mf); evicted.set_info(m_lines[idx]->m_block_addr, m_lines[idx]->get_modified_size(), m_lines[idx]->get_dirty_byte_mask(), @@ -1191,6 +1193,25 @@ void data_cache::send_write_request(mem_fetch *mf, cache_event request, mf->set_status(m_miss_queue_status, time); } +void data_cache::update_m_readable(mem_fetch *mf, unsigned cache_index) { + cache_block_t *block = m_tag_array->get_block(cache_index); + for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { + if (mf->get_access_sector_mask().test(i)) { + bool all_set = true; + for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { + // If any bit in the byte mask (within the sector) is not set, + // the sector is unreadble + if (!block->get_dirty_byte_mask().test(k)) { + all_set = false; + break; + } + } + if (all_set) + block->set_m_readable(true, mf->get_access_sector_mask()); + } + } +} + /****** Write-hit functions (Set by config file) ******/ /// Write-back hit: Mark block as modified @@ -1207,6 +1228,7 @@ cache_request_status 
data_cache::wr_hit_wb(new_addr_type addr, } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); + update_m_readable(mf,cache_index); return HIT; } @@ -1230,6 +1252,7 @@ cache_request_status data_cache::wr_hit_wt(new_addr_type addr, } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); + update_m_readable(mf,cache_index); // generate a write-through send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); @@ -1543,6 +1566,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( if (m_status == HIT_RESERVED) block->set_readable_on_fill(true, mf->get_access_sector_mask()); } + update_m_readable(mf,cache_index); if (m_status != RESERVATION_FAIL) { // If evicted block is modified and not a write-through diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 7a2a8d94d..67d084cbf 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1570,7 +1570,7 @@ class data_cache : public baseline_cache { /// Sends write request to lower level memory (write or writeback) void send_write_request(mem_fetch *mf, cache_event request, unsigned time, std::list &events); - + void update_m_readable(mem_fetch *mf, unsigned cache_index); // Member Function pointers - Set by configuration options // to the functions below each grouping /******* Write-hit configs *******/ From e9d781a467dd21c3ec3f1508aede803cb3ffb2c3 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Thu, 3 Jun 2021 13:56:04 -0400 Subject: [PATCH 085/154] minor improvements --- src/gpgpu-sim/l2cache.cc | 9 +++++---- src/gpgpu-sim/shader.cc | 34 ++++++++++++++-------------------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 57e8ea97c..f1c761fe5 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -716,7 +716,7 @@ 
memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { if (mf->get_data_size() == SECTOR_SIZE && mf->get_access_sector_mask().count() == 1) { result.push_back(mf); - } else if (mf->get_data_size() == 128) { + } else if (mf->get_data_size() == MAX_MEMORY_ACCESS_SIZE) { // break down every sector mem_access_byte_mask_t mask; for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; i++) { @@ -732,11 +732,12 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { result.push_back(n_mf); } + // This is for constant cache } else if (mf->get_data_size() == 64 && - (mf->get_access_sector_mask().to_string() == "1111" || - mf->get_access_sector_mask().to_string() == "0000")) { + (mf->get_access_sector_mask().all() || + mf->get_access_sector_mask().none())) { unsigned start; - if (mf->get_addr() % 128 == 0) + if (mf->get_addr() % MAX_MEMORY_ACCESS_SIZE == 0) start = 0; else start = 2; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c65affdb6..0f6631229 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3344,30 +3344,24 @@ unsigned int shader_core_config::max_cta(const kernel_info_t &k) const { // Unified cache config is in KB. 
Converting to B unsigned total_unified = m_L1D_config.m_unified_cache_size * 1024; - switch (adaptive_cache_config) { - case FIXED: - break; - case ADAPTIVE_CACHE: { - bool l1d_configured = false; - unsigned max_assoc = m_L1D_config.get_max_assoc(); - - for (std::vector::const_iterator it = shmem_opt_list.begin(); - it < shmem_opt_list.end(); it++) { - if (total_shmem <= *it) { - float l1_ratio = 1 - ((float)*(it) / total_unified); - m_L1D_config.set_assoc(max_assoc * l1_ratio); - l1d_configured = true; - break; - } - } - - assert(l1d_configured && "no shared memory option found"); + bool l1d_configured = false; + unsigned max_assoc = m_L1D_config.get_max_assoc(); + + for (std::vector::const_iterator it = shmem_opt_list.begin(); + it < shmem_opt_list.end(); it++) { + if (total_shmem <= *it) { + float l1_ratio = 1 - ((float)*(it) / total_unified); + // make sure the ratio is between 0 and 1 + assert(0 <= l1_ratio && l1_ratio <= 1); + // round to nearest instead of round down + m_L1D_config.set_assoc(max_assoc * l1_ratio + 0.5f); + l1d_configured = true; break; } - default: - assert(0); } + assert(l1d_configured && "no shared memory option found"); + if (m_L1D_config.is_streaming()) { // for streaming cache, if the whole memory is allocated // to the L1 cache, then make the allocation to be on_MISS From 0f088dc11a47cb3de905de3483f6a1c019b7d283 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Wed, 16 Jun 2021 10:22:57 -0400 Subject: [PATCH 086/154] correct dirty counter --- src/gpgpu-sim/gpu-cache.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index a35f5022d..c93ac5fbc 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -428,9 +428,11 @@ void tag_array::fill(new_addr_type addr, unsigned time, void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); - 
m_lines[index]->fill(time, mf->get_access_sector_mask(), - mf->get_access_byte_mask()); - m_dirty++; + bool before = m_lines[index]->is_modified_line(); + m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); + if (m_lines[index]->is_modified_line() && !before) { + m_dirty++; + } } // TODO: we need write back the flushed data to the upper level From 3cf24b8afea9a519fa052e68cf10c1f774ab5f68 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 22 Jun 2021 20:27:43 -0400 Subject: [PATCH 087/154] WT in lazy fetch on read --- src/gpgpu-sim/gpu-cache.cc | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index c93ac5fbc..7416246f0 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1511,35 +1511,17 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time, std::list &events, enum cache_request_status status) { new_addr_type block_addr = m_config.block_addr(addr); - new_addr_type mshr_addr = m_config.mshr_addr(mf->get_addr()); // if the request writes to the whole cache line/sector, then, write and set // cache line Modified. 
and no need to send read request to memory or reserve // mshr - // Write allocate, maximum 2 requests (write miss, write back request) - // Conservatively ensure the worst-case request can be handled this - // cycle - if (m_config.m_write_policy == WRITE_THROUGH) { - bool mshr_hit = m_mshrs.probe(mshr_addr); - bool mshr_avail = !m_mshrs.full(mshr_addr); - if (miss_queue_full(1) || - (!(mshr_hit && mshr_avail) && - !(!mshr_hit && mshr_avail && - (m_miss_queue.size() < m_config.m_miss_queue_size)))) { - // check what is the exactly the failure reason - if (miss_queue_full(1)) - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); - else if (mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL); - else if (!mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL); - else - assert(0); - - return RESERVATION_FAIL; - } + if (miss_queue_full(0)) { + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + return RESERVATION_FAIL; // cannot handle request this cycle + } + if (m_config.m_write_policy == WRITE_THROUGH) { send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); } From b1befa8422493e0deb45811e6b87399355a532ed Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 16 Aug 2021 18:11:30 -0400 Subject: [PATCH 088/154] Adding restricted round robin scheduler --- src/gpgpu-sim/shader.cc | 43 ++++++++++++++++++++++++++++++++++++++++- src/gpgpu-sim/shader.h | 28 +++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 0f6631229..7cee40fc9 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -191,6 +191,8 @@ void shader_core_ctx::create_schedulers() { ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE : sched_config.find("gto") != std::string::npos ? CONCRETE_SCHEDULER_GTO + : sched_config.find("rrr") != std::string::npos + ? 
CONCRETE_SCHEDULER_RRR : sched_config.find("old") != std::string::npos ? CONCRETE_SCHEDULER_OLDEST_FIRST : sched_config.find("warp_limiting") != @@ -225,6 +227,14 @@ void shader_core_ctx::create_schedulers() { &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, &m_pipeline_reg[ID_OC_MEM], i)); break; + case CONCRETE_SCHEDULER_RRR: + schedulers.push_back(new rrr_scheduler( + m_stats, this, m_scoreboard, m_simt_stack, &m_warp, + &m_pipeline_reg[ID_OC_SP], &m_pipeline_reg[ID_OC_DP], + &m_pipeline_reg[ID_OC_SFU], &m_pipeline_reg[ID_OC_INT], + &m_pipeline_reg[ID_OC_TENSOR_CORE], m_specilized_dispatch_reg, + &m_pipeline_reg[ID_OC_MEM], i)); + break; case CONCRETE_SCHEDULER_OLDEST_FIRST: schedulers.push_back(new oldest_scheduler( m_stats, this, m_scoreboard, m_simt_stack, &m_warp, @@ -1101,6 +1111,33 @@ void scheduler_unit::order_lrr( } } +template +void scheduler_unit::order_rrr( + std::vector &result_list, const typename std::vector &input_list, + const typename std::vector::const_iterator &last_issued_from_input, + unsigned num_warps_to_add) { + result_list.clear(); + + if (m_num_issued_last_cycle > 0 || warp(m_current_turn_warp).done_exit() || + warp(m_current_turn_warp).waiting()) { + std::vector::const_iterator iter = + (last_issued_from_input == input_list.end()) ? + input_list.begin() : last_issued_from_input + 1; + for (unsigned count = 0; count < num_warps_to_add; ++iter, ++count) { + if (iter == input_list.end()) { + iter = input_list.begin(); + } + unsigned warp_id = (*iter)->get_warp_id(); + if (!(*iter)->done_exit() && !(*iter)->waiting()) { + result_list.push_back(*iter); + m_current_turn_warp = warp_id; + break; + } + } + } else { + result_list.push_back(&warp(m_current_turn_warp)); + } +} /** * A general function to order things in an priority-based way. * The core usage of the function is similar to order_lrr. 
@@ -1433,7 +1470,7 @@ void scheduler_unit::cycle() { m_last_supervised_issued = supervised_iter; } } - + m_num_issued_last_cycle = issued; if (issued == 1) m_stats->single_issue_nums[m_id]++; else if (issued > 1) @@ -1482,6 +1519,10 @@ void lrr_scheduler::order_warps() { order_lrr(m_next_cycle_prioritized_warps, m_supervised_warps, m_last_supervised_issued, m_supervised_warps.size()); } +void rrr_scheduler::order_warps() { + order_rrr(m_next_cycle_prioritized_warps, m_supervised_warps, + m_last_supervised_issued, m_supervised_warps.size()); +} void gto_scheduler::order_warps() { order_by_priority(m_next_cycle_prioritized_warps, m_supervised_warps, diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 4c6de0683..9cb256a29 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -321,6 +321,7 @@ enum concrete_scheduler { CONCRETE_SCHEDULER_LRR = 0, CONCRETE_SCHEDULER_GTO, CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE, + CONCRETE_SCHEDULER_RRR, CONCRETE_SCHEDULER_WARP_LIMITING, CONCRETE_SCHEDULER_OLDEST_FIRST, NUM_CONCRETE_SCHEDULERS @@ -372,6 +373,12 @@ class scheduler_unit { // this can be copied freely, so can be used in std const typename std::vector &input_list, const typename std::vector::const_iterator &last_issued_from_input, unsigned num_warps_to_add); + template + void order_rrr( + typename std::vector &result_list, + const typename std::vector &input_list, + const typename std::vector::const_iterator &last_issued_from_input, + unsigned num_warps_to_add); enum OrderingType { // The item that issued last is prioritized first then the sorted result @@ -430,6 +437,8 @@ class scheduler_unit { // this can be copied freely, so can be used in std register_set *m_tensor_core_out; register_set *m_mem_out; std::vector &m_spec_cores_out; + unsigned m_num_issued_last_cycle; + unsigned m_current_turn_warp; int m_id; }; @@ -453,6 +462,25 @@ class lrr_scheduler : public scheduler_unit { } }; +class rrr_scheduler : public scheduler_unit { + public: + 
rrr_scheduler(shader_core_stats *stats, shader_core_ctx *shader, + Scoreboard *scoreboard, simt_stack **simt, + std::vector *warp, register_set *sp_out, + register_set *dp_out, register_set *sfu_out, + register_set *int_out, register_set *tensor_core_out, + std::vector &spec_cores_out, + register_set *mem_out, int id) + : scheduler_unit(stats, shader, scoreboard, simt, warp, sp_out, dp_out, + sfu_out, int_out, tensor_core_out, spec_cores_out, + mem_out, id) {} + virtual ~rrr_scheduler() {} + virtual void order_warps(); + virtual void done_adding_supervised_warps() { + m_last_supervised_issued = m_supervised_warps.end(); + } +}; + class gto_scheduler : public scheduler_unit { public: gto_scheduler(shader_core_stats *stats, shader_core_ctx *shader, From b6581477462ea15d92967588277c4fe822a67bf7 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 16 Aug 2021 18:15:20 -0400 Subject: [PATCH 089/154] better oc selecting when sub core enabled --- src/gpgpu-sim/shader.cc | 3 +++ src/gpgpu-sim/shader.h | 45 ++++++++++++++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 7cee40fc9..bcfda1867 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3997,6 +3997,9 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { m_cu[j]->init(j, num_banks, m_bank_warp_shift, shader->get_config(), this, sub_core_model, reg_id, m_num_banks_per_sched); } + for (unsigned j = 0; j < m_dispatch_units.size(); j++) { + m_dispatch_units[j].init(sub_core_model,m_num_warp_scheds); + } m_initialized = true; } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 9cb256a29..f2fac1209 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -950,13 +950,44 @@ class opndcoll_rfu_t { // operand collector based register file unit m_num_collectors = (*cus).size(); m_next_cu = 0; } + void init(bool sub_core_model, 
unsigned num_warp_scheds) { + m_sub_core_model = sub_core_model; + m_num_warp_scheds = num_warp_scheds; + if (m_sub_core_model) { + m_last_cu_set = new unsigned(m_num_warp_scheds); + for (unsigned i = 0; i < m_num_warp_scheds; i++) + { + m_last_cu_set[i] = i * m_num_collectors / m_num_warp_scheds; + } + } + + } collector_unit_t *find_ready() { - for (unsigned n = 0; n < m_num_collectors; n++) { - unsigned c = (m_last_cu + n + 1) % m_num_collectors; - if ((*m_collector_units)[c].ready()) { - m_last_cu = c; - return &((*m_collector_units)[c]); + if (m_sub_core_model) { + assert(m_num_collectors % m_num_warp_scheds == 0 && + m_num_collectors >= m_num_warp_scheds); + unsigned cusPerSched = m_num_collectors / m_num_warp_scheds; + for (unsigned i = 0; i < m_num_warp_scheds; i++) { + unsigned cuLowerBound = i * cusPerSched; + unsigned cuUpperBound = cuLowerBound + cusPerSched; + assert(0 <= cuLowerBound && cuUpperBound <= m_num_collectors); + assert(cuLowerBound <= m_last_cu_set[i] && m_last_cu_set[i] <= cuUpperBound); + for (unsigned j = cuLowerBound; j < cuUpperBound; j++) { + unsigned c = cuLowerBound + (m_last_cu_set[i] + j + 1) % cusPerSched; + if ((*m_collector_units)[c].ready()) { + m_last_cu_set[i] = c; + return &((*m_collector_units)[c]); + } + } + } + } else { + for (unsigned n = 0; n < m_num_collectors; n++) { + unsigned c = (m_last_cu + n + 1) % m_num_collectors; + if ((*m_collector_units)[c].ready()) { + m_last_cu = c; + return &((*m_collector_units)[c]); + } } } return NULL; @@ -966,7 +997,11 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_collectors; std::vector *m_collector_units; unsigned m_last_cu; // dispatch ready cu's rr + unsigned *m_last_cu_set; unsigned m_next_cu; // for initialization + + bool m_sub_core_model; + unsigned m_num_warp_scheds; }; // opndcoll_rfu_t data members From a8256e50a6d25338f659da76ff9c3595132f54b2 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 
23 Aug 2021 13:06:13 -0400 Subject: [PATCH 090/154] Update volta to use lrr scheduler --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 5f22a42b0..425bc1690 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -99,7 +99,7 @@ # Volta has four schedulers per core -gpgpu_num_sched_per_core 4 # Greedy then oldest scheduler --gpgpu_scheduler gto +-gpgpu_scheduler lrr ## In Volta, a warp scheduler can issue 1 inst per cycle -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index c44563fb6..0c69c7084 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -100,7 +100,7 @@ # Volta has four schedulers per core -gpgpu_num_sched_per_core 4 # Greedy then oldest scheduler --gpgpu_scheduler gto +-gpgpu_scheduler lrr ## In Volta, a warp scheduler can issue 1 inst per cycle -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 From 84c4f46fb78b529ab2447d7a676f5b3ac2d9c05f Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 23 Aug 2021 13:06:54 -0400 Subject: [PATCH 091/154] Ampere and Turing also lrr scheduler --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index cc3152c59..0ae91a50f 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -76,7 +76,7 @@ # warp scheduling 
-gpgpu_num_sched_per_core 4 --gpgpu_scheduler gto +-gpgpu_scheduler lrr # a warp scheduler issue mode -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 098cb1d20..854378151 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -76,7 +76,7 @@ # warp scheduling -gpgpu_num_sched_per_core 4 --gpgpu_scheduler gto +-gpgpu_scheduler lrr # a warp scheduler issue mode -gpgpu_max_insn_issue_per_warp 1 -gpgpu_dual_issue_diff_exec_units 1 From 84c6cf45131e42b1a724ebf7977987a9ddb70db9 Mon Sep 17 00:00:00 2001 From: VijayKandiah Date: Sun, 17 Oct 2021 02:18:10 -0500 Subject: [PATCH 092/154] AccelWattch dev Integration --- CHANGES | 4 + COPYRIGHT | 30 + Makefile | 16 +- README.md | 49 +- .../SM6_TITANX/accelwattch_ptx_sim.xml | 623 +++++++++ .../SM6_TITANX/accelwattch_ptx_sim_alt.xml | 623 +++++++++ .../SM6_TITANX/accelwattch_sass_sim.xml | 613 +++++++++ .../SM6_TITANX/accelwattch_sass_sim_alt.xml | 613 +++++++++ .../tested-cfgs/SM6_TITANX/gpgpusim.config | 33 +- .../tested-cfgs/SM75_RTX2060/gpgpusim.config | 3 +- .../SM75_RTX2060_S/accelwattch_ptx_sim.xml | 623 +++++++++ .../accelwattch_ptx_sim_alt.xml | 623 +++++++++ .../SM75_RTX2060_S/accelwattch_sass_sim.xml | 613 +++++++++ .../accelwattch_sass_sim_alt.xml | 613 +++++++++ .../SM75_RTX2060_S/config_turing_islip.icnt | 73 ++ .../SM75_RTX2060_S/gpgpusim.config | 210 +++ .../SM7_QV100/accelwattch_ptx_sim.xml | 623 +++++++++ .../SM7_QV100/accelwattch_ptx_sim_alt.xml | 623 +++++++++ .../SM7_QV100/accelwattch_sass_hw.xml | 613 +++++++++ .../SM7_QV100/accelwattch_sass_hybrid.xml | 613 +++++++++ .../SM7_QV100/accelwattch_sass_sim.xml | 613 +++++++++ .../SM7_QV100/accelwattch_sass_sim_alt.xml | 613 +++++++++ configs/tested-cfgs/SM7_QV100/gpgpusim.config | 38 +- configs/tested-cfgs/SM7_QV100/hw_perf.csv | 26 + 
.../SM7_TITANV/accelwattch_ptx_sim.xml | 623 +++++++++ .../SM7_TITANV/accelwattch_ptx_sim_alt.xml | 623 +++++++++ .../SM7_TITANV/accelwattch_sass_hw.xml | 613 +++++++++ .../SM7_TITANV/accelwattch_sass_hybrid.xml | 613 +++++++++ .../SM7_TITANV/accelwattch_sass_sim.xml | 613 +++++++++ .../SM7_TITANV/accelwattch_sass_sim_alt.xml | 613 +++++++++ .../tested-cfgs/SM7_TITANV/gpgpusim.config | 3 - format-code.sh | 4 +- setup_environment | 14 +- src/abstract_hardware_model.cc | 28 +- src/abstract_hardware_model.h | 69 +- src/{gpuwattch => accelwattch}/Alpha21364.xml | 0 src/{gpuwattch => accelwattch}/Niagara1.xml | 0 .../Niagara1_sharing.xml | 0 .../Niagara1_sharing_DC.xml | 0 .../Niagara1_sharing_SBT.xml | 0 .../Niagara1_sharing_ST.xml | 0 src/{gpuwattch => accelwattch}/Niagara2.xml | 0 src/{gpuwattch => accelwattch}/Penryn.xml | 0 src/{gpuwattch => accelwattch}/README | 0 src/{gpuwattch => accelwattch}/XML_Parse.cc | 361 +++++- src/{gpuwattch => accelwattch}/XML_Parse.h | 60 +- src/{gpuwattch => accelwattch}/Xeon.xml | 0 src/{gpuwattch => accelwattch}/arch_const.h | 0 src/{gpuwattch => accelwattch}/array.cc | 0 src/{gpuwattch => accelwattch}/array.h | 0 .../basic_components.cc | 0 .../basic_components.h | 0 src/{gpuwattch => accelwattch}/cacti/README | 0 .../cacti/Ucache.cc | 4 +- src/{gpuwattch => accelwattch}/cacti/Ucache.h | 0 .../cacti/arbiter.cc | 0 .../cacti/arbiter.h | 0 src/{gpuwattch => accelwattch}/cacti/area.cc | 0 src/{gpuwattch => accelwattch}/cacti/area.h | 0 src/{gpuwattch => accelwattch}/cacti/bank.cc | 0 src/{gpuwattch => accelwattch}/cacti/bank.h | 0 .../cacti/basic_circuit.cc | 0 .../cacti/basic_circuit.h | 0 .../cacti/batch_tests | 0 .../cacti/cache.cfg | 0 src/{gpuwattch => accelwattch}/cacti/cacti.i | 0 src/{gpuwattch => accelwattch}/cacti/cacti.mk | 2 +- .../cacti/cacti_interface.cc | 0 .../cacti/cacti_interface.h | 0 .../cacti/component.cc | 0 .../cacti/component.h | 0 src/{gpuwattch => accelwattch}/cacti/const.h | 0 .../cacti/contention.dat | 0 
.../cacti/crossbar.cc | 0 .../cacti/crossbar.h | 0 .../cacti/decoder.cc | 0 .../cacti/decoder.h | 0 .../cacti/highradix.cc | 0 .../cacti/highradix.h | 0 .../cacti/htree2.cc | 0 src/{gpuwattch => accelwattch}/cacti/htree2.h | 0 src/{gpuwattch => accelwattch}/cacti/io.cc | 0 src/{gpuwattch => accelwattch}/cacti/io.h | 0 src/{gpuwattch => accelwattch}/cacti/main.cc | 0 src/{gpuwattch => accelwattch}/cacti/makefile | 0 src/{gpuwattch => accelwattch}/cacti/mat.cc | 0 src/{gpuwattch => accelwattch}/cacti/mat.h | 0 src/{gpuwattch => accelwattch}/cacti/nuca.cc | 0 src/{gpuwattch => accelwattch}/cacti/nuca.h | 0 .../cacti/out_batch_test_result.csv | 0 .../cacti/parameter.cc | 0 .../cacti/parameter.h | 0 .../cacti/router.cc | 0 src/{gpuwattch => accelwattch}/cacti/router.h | 0 .../cacti/subarray.cc | 0 .../cacti/subarray.h | 0 .../cacti/technology.cc | 0 src/{gpuwattch => accelwattch}/cacti/uca.cc | 0 src/{gpuwattch => accelwattch}/cacti/uca.h | 0 src/{gpuwattch => accelwattch}/cacti/wire.cc | 0 src/{gpuwattch => accelwattch}/cacti/wire.h | 0 src/{gpuwattch => accelwattch}/core.cc | 0 src/{gpuwattch => accelwattch}/core.h | 0 src/{gpuwattch => accelwattch}/fermi.xml | 0 src/{gpuwattch => accelwattch}/globalvar.h | 0 src/{gpuwattch => accelwattch}/gpgpu.xml | 0 .../gpgpu_sim.verify | 0 src/accelwattch/gpgpu_sim_wrapper.cc | 1143 +++++++++++++++++ .../gpgpu_sim_wrapper.h | 81 +- .../gpgpu_static.xml | 0 .../interconnect.cc | 0 src/{gpuwattch => accelwattch}/interconnect.h | 0 .../iocontrollers.cc | 0 .../iocontrollers.h | 0 src/{gpuwattch => accelwattch}/logic.cc | 0 src/{gpuwattch => accelwattch}/logic.h | 0 src/{gpuwattch => accelwattch}/main.cc | 0 src/{gpuwattch => accelwattch}/makefile | 0 src/{gpuwattch => accelwattch}/mcpat.mk | 2 +- .../mcpatXeonCore.mk | 0 src/{gpuwattch => accelwattch}/memoryctrl.cc | 0 src/{gpuwattch => accelwattch}/memoryctrl.h | 0 src/{gpuwattch => accelwattch}/noc.cc | 0 src/{gpuwattch => accelwattch}/noc.h | 0 src/{gpuwattch => 
accelwattch}/processor.cc | 12 +- src/{gpuwattch => accelwattch}/processor.h | 0 src/{gpuwattch => accelwattch}/quadro.xml | 0 .../results/Alpha21364 | 0 .../results/Alpha21364_90nm | 0 src/{gpuwattch => accelwattch}/results/Penryn | 0 src/{gpuwattch => accelwattch}/results/T1 | 0 .../results/T1_DC_64 | 0 .../results/T1_SBT_64 | 0 .../results/T1_ST_64 | 0 src/{gpuwattch => accelwattch}/results/T2 | 0 .../results/Xeon_core | 0 .../results/Xeon_uncore | 0 src/{gpuwattch => accelwattch}/sharedcache.cc | 0 src/{gpuwattch => accelwattch}/sharedcache.h | 0 .../technology_xeon_core.cc | 0 src/{gpuwattch => accelwattch}/version.h | 0 src/{gpuwattch => accelwattch}/xmlParser.cc | 0 src/{gpuwattch => accelwattch}/xmlParser.h | 0 src/cuda-sim/cuda-sim.cc | 203 ++- src/cuda-sim/instructions.cc | 33 +- src/cuda-sim/ptx.l | 67 +- src/cuda-sim/ptx_ir.cc | 25 +- src/gpgpu-sim/dram.cc | 26 +- src/gpgpu-sim/dram.h | 25 +- src/gpgpu-sim/gpu-cache.cc | 22 +- src/gpgpu-sim/gpu-cache.h | 21 +- src/gpgpu-sim/gpu-sim.cc | 170 ++- src/gpgpu-sim/gpu-sim.h | 63 +- src/gpgpu-sim/l2cache.cc | 30 +- src/gpgpu-sim/l2cache.h | 23 +- src/gpgpu-sim/power_interface.cc | 456 ++++++- src/gpgpu-sim/power_interface.h | 35 +- src/gpgpu-sim/power_stat.cc | 467 ++++--- src/gpgpu-sim/power_stat.h | 832 ++++++++---- src/gpgpu-sim/shader.cc | 94 +- src/gpgpu-sim/shader.h | 423 ++++-- src/gpgpu-sim/stat-tool.cc | 2 - src/gpgpu-sim/stat-tool.h | 2 + src/gpuwattch/gpgpu_sim_wrapper.cc | 863 ------------- version | 2 +- 165 files changed, 16621 insertions(+), 1868 deletions(-) create mode 100644 configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml create mode 100644 
configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt create mode 100644 configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_QV100/hw_perf.csv create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml rename src/{gpuwattch => accelwattch}/Alpha21364.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara1.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara1_sharing.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara1_sharing_DC.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara1_sharing_SBT.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara1_sharing_ST.xml (100%) rename src/{gpuwattch => accelwattch}/Niagara2.xml (100%) rename src/{gpuwattch => accelwattch}/Penryn.xml (100%) rename src/{gpuwattch => accelwattch}/README (100%) rename src/{gpuwattch => accelwattch}/XML_Parse.cc (92%) rename src/{gpuwattch => 
accelwattch}/XML_Parse.h (89%) rename src/{gpuwattch => accelwattch}/Xeon.xml (100%) rename src/{gpuwattch => accelwattch}/arch_const.h (100%) rename src/{gpuwattch => accelwattch}/array.cc (100%) rename src/{gpuwattch => accelwattch}/array.h (100%) rename src/{gpuwattch => accelwattch}/basic_components.cc (100%) rename src/{gpuwattch => accelwattch}/basic_components.h (100%) rename src/{gpuwattch => accelwattch}/cacti/README (100%) rename src/{gpuwattch => accelwattch}/cacti/Ucache.cc (99%) rename src/{gpuwattch => accelwattch}/cacti/Ucache.h (100%) rename src/{gpuwattch => accelwattch}/cacti/arbiter.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/arbiter.h (100%) rename src/{gpuwattch => accelwattch}/cacti/area.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/area.h (100%) rename src/{gpuwattch => accelwattch}/cacti/bank.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/bank.h (100%) rename src/{gpuwattch => accelwattch}/cacti/basic_circuit.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/basic_circuit.h (100%) rename src/{gpuwattch => accelwattch}/cacti/batch_tests (100%) rename src/{gpuwattch => accelwattch}/cacti/cache.cfg (100%) rename src/{gpuwattch => accelwattch}/cacti/cacti.i (100%) rename src/{gpuwattch => accelwattch}/cacti/cacti.mk (96%) rename src/{gpuwattch => accelwattch}/cacti/cacti_interface.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/cacti_interface.h (100%) rename src/{gpuwattch => accelwattch}/cacti/component.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/component.h (100%) rename src/{gpuwattch => accelwattch}/cacti/const.h (100%) rename src/{gpuwattch => accelwattch}/cacti/contention.dat (100%) rename src/{gpuwattch => accelwattch}/cacti/crossbar.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/crossbar.h (100%) rename src/{gpuwattch => accelwattch}/cacti/decoder.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/decoder.h (100%) rename src/{gpuwattch => accelwattch}/cacti/highradix.cc (100%) 
rename src/{gpuwattch => accelwattch}/cacti/highradix.h (100%) rename src/{gpuwattch => accelwattch}/cacti/htree2.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/htree2.h (100%) rename src/{gpuwattch => accelwattch}/cacti/io.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/io.h (100%) rename src/{gpuwattch => accelwattch}/cacti/main.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/makefile (100%) rename src/{gpuwattch => accelwattch}/cacti/mat.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/mat.h (100%) rename src/{gpuwattch => accelwattch}/cacti/nuca.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/nuca.h (100%) rename src/{gpuwattch => accelwattch}/cacti/out_batch_test_result.csv (100%) rename src/{gpuwattch => accelwattch}/cacti/parameter.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/parameter.h (100%) rename src/{gpuwattch => accelwattch}/cacti/router.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/router.h (100%) rename src/{gpuwattch => accelwattch}/cacti/subarray.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/subarray.h (100%) rename src/{gpuwattch => accelwattch}/cacti/technology.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/uca.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/uca.h (100%) rename src/{gpuwattch => accelwattch}/cacti/wire.cc (100%) rename src/{gpuwattch => accelwattch}/cacti/wire.h (100%) rename src/{gpuwattch => accelwattch}/core.cc (100%) rename src/{gpuwattch => accelwattch}/core.h (100%) rename src/{gpuwattch => accelwattch}/fermi.xml (100%) rename src/{gpuwattch => accelwattch}/globalvar.h (100%) rename src/{gpuwattch => accelwattch}/gpgpu.xml (100%) rename src/{gpuwattch => accelwattch}/gpgpu_sim.verify (100%) create mode 100644 src/accelwattch/gpgpu_sim_wrapper.cc rename src/{gpuwattch => accelwattch}/gpgpu_sim_wrapper.h (68%) rename src/{gpuwattch => accelwattch}/gpgpu_static.xml (100%) rename src/{gpuwattch => accelwattch}/interconnect.cc (100%) rename src/{gpuwattch => 
accelwattch}/interconnect.h (100%) rename src/{gpuwattch => accelwattch}/iocontrollers.cc (100%) rename src/{gpuwattch => accelwattch}/iocontrollers.h (100%) rename src/{gpuwattch => accelwattch}/logic.cc (100%) rename src/{gpuwattch => accelwattch}/logic.h (100%) rename src/{gpuwattch => accelwattch}/main.cc (100%) rename src/{gpuwattch => accelwattch}/makefile (100%) rename src/{gpuwattch => accelwattch}/mcpat.mk (97%) rename src/{gpuwattch => accelwattch}/mcpatXeonCore.mk (100%) rename src/{gpuwattch => accelwattch}/memoryctrl.cc (100%) rename src/{gpuwattch => accelwattch}/memoryctrl.h (100%) rename src/{gpuwattch => accelwattch}/noc.cc (100%) rename src/{gpuwattch => accelwattch}/noc.h (100%) rename src/{gpuwattch => accelwattch}/processor.cc (99%) rename src/{gpuwattch => accelwattch}/processor.h (100%) rename src/{gpuwattch => accelwattch}/quadro.xml (100%) rename src/{gpuwattch => accelwattch}/results/Alpha21364 (100%) rename src/{gpuwattch => accelwattch}/results/Alpha21364_90nm (100%) rename src/{gpuwattch => accelwattch}/results/Penryn (100%) rename src/{gpuwattch => accelwattch}/results/T1 (100%) rename src/{gpuwattch => accelwattch}/results/T1_DC_64 (100%) rename src/{gpuwattch => accelwattch}/results/T1_SBT_64 (100%) rename src/{gpuwattch => accelwattch}/results/T1_ST_64 (100%) rename src/{gpuwattch => accelwattch}/results/T2 (100%) rename src/{gpuwattch => accelwattch}/results/Xeon_core (100%) rename src/{gpuwattch => accelwattch}/results/Xeon_uncore (100%) rename src/{gpuwattch => accelwattch}/sharedcache.cc (100%) rename src/{gpuwattch => accelwattch}/sharedcache.h (100%) rename src/{gpuwattch => accelwattch}/technology_xeon_core.cc (100%) rename src/{gpuwattch => accelwattch}/version.h (100%) rename src/{gpuwattch => accelwattch}/xmlParser.cc (100%) rename src/{gpuwattch => accelwattch}/xmlParser.h (100%) delete mode 100644 src/gpuwattch/gpgpu_sim_wrapper.cc diff --git a/CHANGES b/CHANGES index 7964153c0..5d1cd1082 100644 --- a/CHANGES +++ 
b/CHANGES @@ -1,4 +1,8 @@ LOG: +Version 4.2.0 vs 4.1.0 +- Added AccelWattch power model v1.0 which replaces GPUWattch. +- Added AccelWattch XML configuration files for SM7_QV100, SM7_TITANV, SM75_RTX2060_S, SM6_TITANX. Note that all these AccelWattch XML configuration files are tuned only for SM7_QV100. + Version 4.1.0 versus 4.0.0 -Features: 1- Supporting L1 write-allocate with sub-sector writing policy as in Volta+ hardware, and changing the Volta+ cards config to make L1 write-allocate with write-through diff --git a/COPYRIGHT b/COPYRIGHT index a4eea2915..1c949f93e 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -44,3 +44,33 @@ per UBC policy 88, item 2.3 on literary works) these students names appear in the copyright notices of the respective files. UBC is also mentioned in the copyright notice to highlight that was the author's affiliation when the work was performed. + +NOTE 3: AccelWattch and all its components are covered by the following license and copyright. +Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +Northwestern University, Purdue University, The University of British Columbia +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer; +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution; +3. Neither the names of Northwestern University, Purdue University, + The University of British Columbia nor the names of their contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/Makefile b/Makefile index d248211cd..82ea39928 100644 --- a/Makefile +++ b/Makefile @@ -87,7 +87,7 @@ ifneq ($(GPGPUSIM_POWER_MODEL),) MCPAT_DBG_FLAG = dbg endif - MCPAT_OBJ_DIR = $(SIM_OBJ_FILES_DIR)/gpuwattch + MCPAT_OBJ_DIR = $(SIM_OBJ_FILES_DIR)/accelwattch MCPAT = $(MCPAT_OBJ_DIR)/*.o endif @@ -117,24 +117,24 @@ check_setup_environment: fi check_power: - @if [ -d "$(GPGPUSIM_ROOT)/src/gpuwattch/" -a ! -n "$(GPGPUSIM_POWER_MODEL)" ]; then \ + @if [ -d "$(GPGPUSIM_ROOT)/src/accelwattch/" -a ! -n "$(GPGPUSIM_POWER_MODEL)" ]; then \ echo ""; \ - echo " Power model detected in default directory ($(GPGPUSIM_ROOT)/src/gpuwattch) but GPGPUSIM_POWER_MODEL not set."; \ - echo " Please re-run setup_environment or manually set GPGPUSIM_POWER_MODEL to the gpuwattch directory if you would like to include the GPGPU-Sim Power Model."; \ + echo " Power model detected in default directory ($(GPGPUSIM_ROOT)/src/accelwattch) but GPGPUSIM_POWER_MODEL not set."; \ + echo " Please re-run setup_environment or manually set GPGPUSIM_POWER_MODEL to the accelwattch directory if you would like to include the GPGPU-Sim Power Model."; \ echo ""; \ true; \ elif [ ! 
-d "$(GPGPUSIM_POWER_MODEL)" ]; then \ echo ""; \ echo "ERROR ** Power model directory invalid."; \ echo "($(GPGPUSIM_POWER_MODEL)) is not a valid directory."; \ - echo "Please set GPGPUSIM_POWER_MODEL to the GPGPU-Sim gpuwattch directory."; \ + echo "Please set GPGPUSIM_POWER_MODEL to the GPGPU-Sim accelwattch directory."; \ echo ""; \ exit 101; \ elif [ -n "$(GPGPUSIM_POWER_MODEL)" -a ! -f "$(GPGPUSIM_POWER_MODEL)/gpgpu_sim.verify" ]; then \ echo ""; \ echo "ERROR ** Power model directory invalid."; \ echo "gpgpu_sim.verify not found in $(GPGPUSIM_POWER_MODEL)."; \ - echo "Please ensure that GPGPUSIM_POWER_MODEL points to a valid gpuwattch directory and that you have the correct GPGPU-Sim mcpat distribution."; \ + echo "Please ensure that GPGPUSIM_POWER_MODEL points to a valid accelwattch directory and that you have the correct GPGPU-Sim mcpat distribution."; \ echo ""; \ exit 102; \ fi @@ -243,8 +243,8 @@ makedirs: if [ ! -d $(SIM_OBJ_FILES_DIR)/libopencl/bin ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/libopencl/bin; fi; if [ ! -d $(SIM_OBJ_FILES_DIR)/$(INTERSIM) ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/$(INTERSIM); fi; if [ ! -d $(SIM_OBJ_FILES_DIR)/cuobjdump_to_ptxplus ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/cuobjdump_to_ptxplus; fi; - if [ ! -d $(SIM_OBJ_FILES_DIR)/gpuwattch ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/gpuwattch; fi; - if [ ! -d $(SIM_OBJ_FILES_DIR)/gpuwattch/cacti ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/gpuwattch/cacti; fi; + if [ ! -d $(SIM_OBJ_FILES_DIR)/accelwattch ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/accelwattch; fi; + if [ ! -d $(SIM_OBJ_FILES_DIR)/accelwattch/cacti ]; then mkdir -p $(SIM_OBJ_FILES_DIR)/accelwattch/cacti; fi; all: $(MAKE) gpgpusim diff --git a/README.md b/README.md index 9bb891659..da0893585 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ Welcome to GPGPU-Sim, a cycle-level simulator modeling contemporary graphics processing units (GPUs) running GPU computing workloads written in CUDA or OpenCL. 
Also included in GPGPU-Sim is a performance visualization tool called -AerialVision and a configurable and extensible energy model called GPUWattch. -GPGPU-Sim and GPUWattch have been rigorously validated with performance and +AerialVision and a configurable and extensible power model called AccelWattch. +GPGPU-Sim and AccelWattch have been rigorously validated with performance and power measurements of real hardware GPUs. This version of GPGPU-Sim has been tested with a subset of CUDA version 4.2, @@ -38,12 +38,11 @@ Md Aamir Raihan, Negar Goli, Tor Aamodt, Modeling Deep Learning Accelerator Enabled GPUs, arXiv:1811.08309, https://arxiv.org/abs/1811.08309 -If you use the GPUWattch energy model in your research, please cite: +If you use the AccelWattch power model in your research, please cite: -Jingwen Leng, Tayler Hetherington, Ahmed ElTantawy, Syed Gilani, Nam Sung Kim, -Tor M. Aamodt, Vijay Janapa Reddi, GPUWattch: Enabling Energy Optimizations in -GPGPUs, In proceedings of the ACM/IEEE International Symposium on Computer -Architecture (ISCA 2013), Tel-Aviv, Israel, June 23-27, 2013. +Vijay Kandiah, Scott Peverelle, Mahmoud Khairy, Junrui Pan, Amogh Manjunath, Timothy G. Rogers, Tor M. Aamodt, and Nikos Hardavellas. 2021. +AccelWattch: A Power Modeling Framework for Modern GPUs. In MICRO54: 54th Annual IEEE/ACM International Symposium on Microarchitecture +(MICRO ’21), October 18–22, 2021, Virtual Event, Greece. If you use the support for CUDA dynamic parallelism in your research, please cite: @@ -62,8 +61,8 @@ This file contains instructions on installing, building and running GPGPU-Sim. Detailed documentation on what GPGPU-Sim models, how to configure it, and a guide to the source code can be found here: . Instructions for building doxygen source code documentation are included below. -Detailed documentation on GPUWattch including how to configure it and a guide -to the source code can be found here: . 
+ +Previous versions of GPGPU-Sim (3.2.0 to 4.1.0) included the [GPUWattch Energy model](http://gpgpu-sim.org/gpuwattch/) which has been replaced by AccelWattch version 1.0 in GPGPU-Sim version 4.2.0. AccelWattch supports modern GPUs and is validated against a NVIDIA Volta QV100 GPU. Detailed documentation on AccelWattch can be found here: [AccelWattch Overview](https://github.com/VijayKandiah/accel-sim-framework#accelwattch-overview) and [AccelWattch MICRO'21 Artifact Manual](https://github.com/VijayKandiah/accel-sim-framework/blob/release/AccelWattch.md). If you have questions, please sign up for the google groups page (see gpgpu-sim.org), but note that use of this simulator does not imply any level of @@ -108,21 +107,20 @@ library (part of the CUDA toolkit). Code to interface with the CUDA Math library is contained in cuda-math.h, which also includes several structures derived from vector_types.h (one of the CUDA header files). -## GPUWattch Energy Model +## AccelWattch Power Model -GPUWattch (introduced in GPGPU-Sim 3.2.0) was developed by researchers at the -University of British Columbia, the University of Texas at Austin, and the -University of Wisconsin-Madison. Contributors to GPUWattch include Tor -Aamodt's research group at the University of British Columbia: Tayler -Hetherington and Ahmed ElTantawy; Vijay Reddi's research group at the -University of Texas at Austin: Jingwen Leng; and Nam Sung Kim's research group -at the University of Wisconsin-Madison: Syed Gilani. +AccelWattch (introduced in GPGPU-Sim 4.2.0) was developed by researchers at +Northwestern University, Purdue University, and the University of British Columbia. +Contributors to AccelWattch include Nikos Hardavellas's research group at Northwestern University: +Vijay Kandiah; Tor Aamodt's research group at the University of British Columbia: Scott Peverelle; +and Timothy Rogers's research group at Purdue University: Mahmoud Khairy, Junrui Pan, and Amogh Manjunath. 
-GPUWattch leverages McPAT, which was developed by Sheng Li et al. at the +AccelWattch leverages McPAT, which was developed by Sheng Li et al. at the University of Notre Dame, Hewlett-Packard Labs, Seoul National University, and -the University of California, San Diego. The paper can be found at +the University of California, San Diego. The McPAT paper can be found at http://www.hpl.hp.com/research/mcpat/micro09.pdf. + # INSTALLING, BUILDING and RUNNING GPGPU-Sim Assuming all dependencies required by GPGPU-Sim are installed on your system, @@ -316,15 +314,16 @@ need to re-compile your application simply to run it on GPGPU-Sim. To revert back to running on the hardware, remove GPGPU-Sim from your LD_LIBRARY_PATH environment variable. -The following GPGPU-Sim configuration options are used to enable GPUWattch +The following GPGPU-Sim configuration options are used to enable AccelWattch -power_simulation_enabled 1 (1=Enabled, 0=Not enabled) - -gpuwattch_xml_file .xml - + -power_simulation_mode 0 (0=AccelWattch_SASS_SIM or AccelWattch_PTX_SIM, 1=AccelWattch_SASS_HW, 2=AccelWattch_SASS_HYBRID) + -accelwattch_xml_file .xml -The GPUWattch XML configuration file name is set to gpuwattch.xml by default and -currently only supplied for GTX480 (default=gpuwattch_gtx480.xml). Please refer to - for more information. +The AccelWattch XML configuration file name is set to accelwattch_sass_sim.xml by default and is +currently provided for SM7_QV100, SM7_TITANV, SM75_RTX2060_S, and SM6_TITANX. +Note that all these AccelWattch XML configuration files are tuned only for SM7_QV100. Please refer to + for more information. Running OpenCL applications is identical to running CUDA applications. 
However, OpenCL applications need to communicate with the NVIDIA driver in order to diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml new file mode 100644 index 000000000..0c6f21147 --- /dev/null +++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_ptx_sim_alt.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml new file mode 100644 index 000000000..570332d1c --- /dev/null +++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml new file mode 100644 index 000000000..9998e9656 --- /dev/null +++ b/configs/tested-cfgs/SM6_TITANX/accelwattch_sass_sim_alt.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config index 5b243a5b6..652f0a09e 100644 --- a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config +++ b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config @@ -1,3 +1,32 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, 
Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ # This config models the Pascal GP102 (NVIDIA TITAN X) # For more info about this card, see Nvidia White paper # http://international.download.nvidia.com/geforce-com/international/pdfs/GeForce_GTX_1080_Whitepaper_FINAL.pdf @@ -28,6 +57,7 @@ -gpgpu_n_cores_per_cluster 1 -gpgpu_n_mem 12 -gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 # Pascal clock domains #-gpgpu_clock_domains ::: @@ -170,11 +200,8 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Pascal 102 --power_simulation_enabled 0 # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD #-trace_sampling_core 0 - diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 0ae91a50f..2a9bff015 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -175,5 +175,4 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 - +#-trace_sampling_core 0 \ No newline at end of file diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml new file mode 100644 index 000000000..0c6f21147 --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_ptx_sim_alt.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git 
a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml new file mode 100644 index 000000000..570332d1c --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml new file mode 100644 index 000000000..9998e9656 --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/accelwattch_sass_sim_alt.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt b/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt new file mode 100644 index 000000000..eed1c34b6 --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/config_turing_islip.icnt @@ -0,0 +1,73 @@ +//52*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 52; +n = 1; + +// Routing + +routing_function = dest_tag; + +// Flow control + +num_vcs = 1; +vc_buf_size = 64; +input_buffer_size = 256; +ejection_buffer_size = 64; +boundary_buffer_size = 64; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; 
+injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config new file mode 100644 index 000000000..0fb4742e1 --- /dev/null +++ b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config @@ -0,0 +1,210 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + +# This config models the Turing RTX 2060 Super +# For more info about turing architecture: +# 1- https://www.nvidia.com/content/dam/en-zz/Solutions/design-visualization/technologies/turing-architecture/NVIDIA-Turing-Architecture-Whitepaper.pdf +# 2- "RTX on—The NVIDIA Turing GPU", IEEE MICRO 2020 + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 75 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 5 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 34 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 16 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1905.0:1905.0:1905.0:3500.0 +# boost mode +# -gpgpu_clock_domains 1680.0:1680.0:1680.0:3500.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 75 + +# This implies a maximum of 32 warps/SM +-gpgpu_shader_core_pipeline 1024:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of 
FUs +# ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Turing has 4 SP SIMD units, 4 INT units, 4 SFU units, 8 Tensor core units +## We need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,0,4,4,4,4,0,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,32 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Turing has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Turing, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 75 + +# Trung has sub core model, in which each scheduler has its own register file and EUs +# i.e. 
schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# turing has 8 banks dual-port, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +-gpgpu_adaptive_cache_config 0 +-gpgpu_l1_banks 4 +-gpgpu_cache:dl1 S:1:128:512,L:L:s:N:L,A:256:8,16:0,32 +-gpgpu_shmem_size 65536 +-gpgpu_shmem_sizeDefault 65536 +-gpgpu_shmem_per_block 65536 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_n_cluster_ejection_buffer_size 32 +-gpgpu_l1_latency 20 +-gpgpu_smem_latency 20 +-gpgpu_flush_l1_cache 1 + +# 64 sets, each 128 bytes 16-way for each memory sub partition (128 KB per memory sub partition). This gives us 4MB L2 cache +-gpgpu_cache:dl2 S:64:128:16,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 1 +-gpgpu_memory_partition_indexing 0 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprected in Volta, It is used for legacy apps only. 
Use L1D cache instead with .nc modifier or __ldg mehtod +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_turing_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_arbiter_algo 1 +-icnt_flit_size 40 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# Turing has GDDR6 +# http://monitorinsider.com/GDDR6.html +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 2 +-gpgpu_dram_burst_length 16 +-dram_data_command_freq_ratio 4 +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS + +# Use the same GDDR5 timing, scaled to 3500MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=4:RRD=10:RCD=20:RAS=50:RP=20:RC=62: + CL=20:WL=8:CDLR=9:WR=20:nbkgrp=4:CCDL=4:RTPL=4" + +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml new file mode 100644 index 000000000..0c6f21147 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/accelwattch_ptx_sim_alt.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml new file mode 100644 index 000000000..64f89d646 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hw.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml new file mode 100644 index 000000000..175f1fd47 --- /dev/null +++ 
b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_hybrid.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml new file mode 100644 index 000000000..570332d1c --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml new file mode 100644 index 000000000..9998e9656 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/accelwattch_sass_sim_alt.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config 
b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 425bc1690..76c99b7d6 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -1,4 +1,34 @@ -# This config models the Volta +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. + + +# This config models the Volta Quadro V100 # For more info about volta architecture: # http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf # https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# @@ -34,10 +64,11 @@ -gpgpu_n_cores_per_cluster 1 -gpgpu_n_mem 32 -gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 # volta clock domains #-gpgpu_clock_domains ::: --gpgpu_clock_domains 1132.0:1132.0:1132.0:850.0 +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 # boost mode # -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 @@ -199,9 +230,6 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Volta --power_simulation_enabled 0 - # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD diff --git a/configs/tested-cfgs/SM7_QV100/hw_perf.csv b/configs/tested-cfgs/SM7_QV100/hw_perf.csv new file mode 100644 index 000000000..aa88bb256 --- /dev/null +++ b/configs/tested-cfgs/SM7_QV100/hw_perf.csv @@ -0,0 +1,26 @@ +Benchmark,Kernel,L1_RH,L1_RM,L1_WH,L1_WM,CC_ACC,SHRD_ACC,DRAM_Rd,DRAM_Wr,L2_RH,L2_RM,L2_WH,L2_WM,NOC,Pipeline_Duty,Num_Idle_SMs,Elapsed_Cycles,Chip Voltage +b+tree-rodinia-3.1,findRangeK,1634256.0,561818.0,40785.0,19032.0,0.0,0.0,259346.0,3524.0,396522.0,259508.0,60000.0,0.0,1343246.0,0.3268163900773488,5.064000000000002,66542.7,1.0 
+b+tree-rodinia-3.1,findK,1318908.0,525035.0,42619.0,7404.0,0.0,0.0,255317.0,2582.0,366918.0,255364.0,50000.0,0.0,1250108.0,0.2740918672650619,3.191999999999995,80883.0,1.0 +backprop-rodinia-3.1,_Z22bpnn_layerforward_CUDAPfS_S_S_ii,49152.0,143738.0,192432.0,4232.0,0.0,413696.0,147464.0,60097.0,29059.0,147460.0,196608.0,0.0,704512.0,0.5619432556155418,7.520000000000007,23324.775,1.0 +backprop-rodinia-3.1,_Z24bpnn_adjust_weights_cudaPfiS_iS_S_,465990.0,277805.0,327015.0,887.0,0.0,0.0,286738.0,190646.0,54315.0,286734.0,327686.0,0.0,1263518.0,0.20116733697224465,9.496000000000002,32578.425,1.0 +hotspot-rodinia-3.1,_Z14calculate_tempiPfS_S_iiiiffffff,4250.0,691050.0,0.0,175104.0,0.0,997428.0,262147.0,66263.0,486965.0,262144.0,175104.0,0.0,1732988.0,0.9470499252952201,3.3200000000000074,56438.825,1.0 +kmeans-rodinia-3.1,_Z11kmeansPointPfiiiPiS_S_S0_,0.0,0.0,0.0,102400.0,4352107.0,0.0,12302960.0,92472.5,6742186.0,12321532.0,102400.0,0.0,26022036.0,0.11420395712434231,1.5799999999999947,894550.775,1.0 +srad_v1-rodinia-3.1,_Z4sradfiilPiS_S_S_PfS0_S0_S0_fS0_S0_,158304.87000000002,89035.40999999999,0.0,143700.0,0.0,0.0,28986.500000000033,45424.200000000026,68135.7,28984.00000000001,143700.0,0.0,481258.2600000001,0.5320091849844065,15.272880000000004,14251.741749999997,1.0 +parboil-sad,_Z11mb_sad_calcPtS_ii,101840.0,415925.0,2102177.0,7289373.0,0.0,10033920.0,257308.0,8720433.0,8754664.0,257280.0,9390720.0,0.0,36398656.0,0.25130932753519797,0.19199999999999662,6551129.125,1.0 +parboil-sgemm,_Z9mysgemmNTPKfiS0_iPfiiff,7109956.0,2452728.0,133388.0,1284.0,0.0,8642304.0,393092.0,36894.0,2059512.0,393088.0,135168.0,0.0,5176696.0,0.5495706862295477,1.8799999999999972,358744.025,1.0 +parboil-mri-q,_Z12ComputeQ_GPUiiPfS_S_S_S_,0.0,163840.0,65184.0,154.0,17617612.5,0.0,164356.0,0.0,0.0,163840.0,65536.0,0.0,458752.0,0.5767256645623982,12.363999999999997,691892.925,1.0 
+dct8x8,_Z14CUDAkernel1DCTPfiiiy,0.0,0.0,552.8,32121.9,786431.9999999999,114688.00000000001,32786.0,0.0,16383.999999999998,32767.999999999996,32767.999999999996,0.0,131071.99999999999,0.06091433507559575,7.7799999999999985,24207.632500000003,1.0 +dct8x8,_Z14CUDAkernel2DCTPfS_i,0.0,32768.00000000002,0.0,32768.00000000002,0.0,49152.00000000004,32773.25742574254,0.0,0.0,32768.00000000002,32768.00000000002,0.0,131072.0000000001,0.14345732731755537,30.750257425742568,5822.941584158416,1.0 +binomialOptions,_Z21binomialOptionsKernelv,0.0,0.0,0.0,1024.0,23688.0,16778240.0,640.0,0.0,0.0,0.0,1024.0,0.0,2048.0,0.6457304629145744,1.9519999999999982,1366301.225,1.0 +fastWalshTransform,_Z15fwtBatch2KernelPfS_i,0.0,1048576.0000000002,774120.4444444445,271536.22222222225,0.0,0.0,1048581.888888889,945003.222222222,0.0,1048576.0000000002,1048576.0000000002,0.0,4194304.000000001,0.0867005928407203,2.574222222222223,120947.73472222223,1.0 +fastWalshTransform,_Z15fwtBatch1KernelPfS_i,0.0,1048576.0,645060.0,403890.6666666666,0.0,3407872.0,1048581.0,950303.3333333333,0.0,1048576.0,1048576.0,0.0,4194304.0,0.3836524328760675,2.621333333333329,149487.8,1.0 +histogram,_Z17histogram64KernelPjP5uint4j,0.0,2097152.0,0.0,34960.0,0.0,4893504.000000001,2097184.2941176468,26959.294117647052,0.0,2097152.0,34960.0,0.0,4264223.999999999,0.3361853461559831,3.706823529411762,146480.14411764703,1.0 +mergeSort,_Z21mergeSortSharedKernelILj1EEvPjS0_S0_S0_j,0.0,1048576.0,0.0,1048576.0,0.0,12976128.0,1048580.0,950169.0,0.0,1048576.0,1048576.0,0.0,4194304.0,0.9137102229423307,1.1600000000000055,439316.525,1.0 +mergeSort,_Z30mergeElementaryIntervalsKernelILj1EEvPjS0_S0_S0_S0_S0_jj,152481.75,1127706.3333333333,439852.24999999994,829969.9166666665,0.0,3670010.1666666665,1056772.0000000002,959704.0833333334,199523.16666666666,1056768.0,1269875.1666666667,0.0,4878632.833333334,0.44812863772322986,1.6420000000000003,157457.05,1.0 
+quasirandomGenerator,_Z26quasirandomGeneratorKernelPfjj,0.0,0.0,0.0,393215.9999999999,47616.000000000015,0.0,21.0,294938.38095238095,0.0,0.0,393215.9999999999,0.0,786431.9999999998,0.6109600290450061,17.68266666666667,80626.8130952381,1.0 +quasirandomGenerator,_Z16inverseCNDKernelPfPjj,0.0,0.0,0.0,393215.9999999999,0.0,0.0,5.952380952380952,294941.6666666666,0.0,0.0,393215.9999999999,0.0,786431.9999999998,0.307434624439692,5.790476190476192,58367.4988095238,1.0 +sobolQRNG,_Z15sobolGPU_kerneljjPjPf,172832.0,31976.0,0.0,1250000.0,0.0,1899700.0,405.0,1151641.0,31592.0,400.0,1250000.0,0.0,2563936.0,0.6380044567750587,2.7840000000000042,112087.775,1.0 +cutlass_perf_test_k1,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L
_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,460800.0,0.0,5120.0,160.0,577120.0000000001,412167.99999999994,42.285714285714285,48640.0,412160.0,5120.0,0.0,931840.0,0.24658369358809393,60.32228571428572,139808.59999999998,1.0 +cutlass_perf_test_k2,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyA
ddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,2097151.9999999995,171796.0,65782.85714285714,255.99999999999994,1464319.9999999998,1081352.2857142857,45.42857142857143,1015808.0000000002,1081344.0,237568.0,0.0,4669440.0,0.38530040572560803,48.440000000000005,228263.9035714286,1.0 +cutlass_perf_test_k3,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumula
torsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,3276800.0000000005,429682.85714285716,164204.57142857142,640.0,2309120.0,491527.9999999999,77869.28571428571,2785279.9999999995,491519.99999999994,593920.0000000001,0.0,7741440.0,0.8525726478636384,1.832,161781.07857142857,1.0 +cudaTensorCoreGemm,_Z12compute_gemmPK6__halfS1_PKfPfff,0.0,69206016.0,0.0,2097152.0,0.0,30146560.0,16974052.0,1998866.0,52232060.0,16973824.0,2097152.0,0.0,142606336.0,0.7380984268363922,1.264000000000003,3871172.375,1.0 diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml 
b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml new file mode 100644 index 000000000..0c6f21147 --- /dev/null +++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_ptx_sim_alt.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml new file mode 100644 index 000000000..64f89d646 --- /dev/null +++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hw.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml new file mode 100644 index 000000000..175f1fd47 --- /dev/null +++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_hybrid.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml new file mode 100644 index 000000000..570332d1c --- /dev/null +++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml new file mode 100644 index 000000000..9998e9656 --- /dev/null +++ b/configs/tested-cfgs/SM7_TITANV/accelwattch_sass_sim_alt.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 0c69c7084..5c6be224a 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -200,9 +200,6 @@ -enable_ptx_file_line_stats 1 -visualizer_enabled 0 -# power model configs, disable it untill we create a real energy model for Volta --power_simulation_enabled 0 - # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD diff --git a/format-code.sh b/format-code.sh index 9f470854b..ac753f059 100755 --- a/format-code.sh +++ b/format-code.sh @@ -8,5 +8,5 @@ clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.h clang-format -i ${THIS_DIR}/src/gpgpu-sim/*.cc clang-format -i ${THIS_DIR}/src/cuda-sim/*.h clang-format -i ${THIS_DIR}/src/cuda-sim/*.cc -clang-format -i ${THIS_DIR}/src/gpuwattch/*.h -clang-format -i ${THIS_DIR}/src/gpuwattch/*.cc \ No newline at end of file +clang-format -i ${THIS_DIR}/src/accelwattch/*.h +clang-format -i ${THIS_DIR}/src/accelwattch/*.cc \ No newline at end of file diff --git a/setup_environment b/setup_environment index 
07d078844..d3ff8403c 100644 --- a/setup_environment +++ b/setup_environment @@ -117,18 +117,18 @@ fi # The following checks to see if the GPGPU-Sim power model is enabled. # GPGPUSIM_POWER_MODEL points to the directory where gpgpusim_mcpat is located. -# If this is not set, it checks the default directory "$GPGPUSIM_ROOT/src/gpuwattch/". -if [ -d $GPGPUSIM_ROOT/src/gpuwattch/ ]; then - if [ ! -f $GPGPUSIM_ROOT/src/gpuwattch/gpgpu_sim.verify ]; then - echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/gpuwattch"; +# If this is not set, it checks the default directory "$GPGPUSIM_ROOT/src/accelwattch/". +if [ -d $GPGPUSIM_ROOT/src/accelwattch/ ]; then + if [ ! -f $GPGPUSIM_ROOT/src/accelwattch/gpgpu_sim.verify ]; then + echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/accelwattch"; return; fi - export GPGPUSIM_POWER_MODEL=$GPGPUSIM_ROOT/src/gpuwattch/; - echo "configured with GPUWattch."; + export GPGPUSIM_POWER_MODEL=$GPGPUSIM_ROOT/src/accelwattch/; + echo "configured with AccelWattch."; elif [ -n "$GPGPUSIM_POWER_MODEL" ]; then if [ ! -f $GPGPUSIM_POWER_MODEL/gpgpu_sim.verify ]; then echo ""; - echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/gpuwattch/ - Either incorrect directory or incorrect McPAT version"; + echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/accelwattch/ - Either incorrect directory or incorrect McPAT version"; return; fi echo "configure with power model in $GPGPUSIM_POWER_MODEL."; diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 30aee60c9..208047eeb 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,6 +27,7 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
+ #include "abstract_hardware_model.h" #include #include @@ -281,14 +283,16 @@ void warp_inst_t::broadcast_barrier_reduction( void warp_inst_t::generate_mem_accesses() { if (empty() || op == MEMORY_BARRIER_OP || m_mem_accesses_created) return; if (!((op == LOAD_OP) || (op == TENSOR_CORE_LOAD_OP) || (op == STORE_OP) || - (op == TENSOR_CORE_STORE_OP))) + (op == TENSOR_CORE_STORE_OP) )) return; if (m_warp_active_mask.count() == 0) return; // predicated off const size_t starting_queue_size = m_accessq.size(); assert(is_load() || is_store()); - assert(m_per_scalar_thread_valid); // need address information per thread + + //if((space.get_type() != tex_space) && (space.get_type() != const_space)) + assert(m_per_scalar_thread_valid); // need address information per thread bool is_write = is_store(); diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 35e28ca57..f04741f75 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Inderpreet Singh, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -59,6 +60,30 @@ enum _memory_space_t { instruction_space }; +#ifndef COEFF_STRUCT +#define COEFF_STRUCT + +struct PowerscalingCoefficients{ + double int_coeff; + double int_mul_coeff; + double int_mul24_coeff; + double int_mul32_coeff; + double int_div_coeff; + double fp_coeff; + double dp_coeff; + double fp_mul_coeff; + double fp_div_coeff; + double dp_mul_coeff; + double dp_div_coeff; + double sqrt_coeff; + double log_coeff; + double sin_coeff; + double exp_coeff; + double tensor_coeff; + double tex_coeff; +}; +#endif + enum FuncCache { FuncCachePreferNone = 0, FuncCachePreferShared = 1, @@ -134,8 +159,14 @@ enum special_operations_t { FP_SQRT_OP, FP_LG_OP, FP_SIN_OP, - FP_EXP_OP + FP_EXP_OP, + DP_MUL_OP, + DP_DIV_OP, + DP___OP, + TENSOR__OP, + TEX__OP }; + typedef enum special_operations_t special_ops; // Required to identify for the power model enum operation_pipeline_t { @@ -911,6 +942,7 @@ class inst_t { sp_op = OTHER_OP; op_pipe = UNKOWN_OP; mem_op = NOT_TEX; + const_cache_operand = 0; num_operands 
= 0; num_regs = 0; memset(out, 0, sizeof(unsigned)); @@ -939,6 +971,20 @@ class inst_t { return (op == STORE_OP || op == TENSOR_CORE_STORE_OP || memory_op == memory_store); } + + bool is_fp() const { return ((sp_op == FP__OP));} //VIJAY + bool is_fpdiv() const { return ((sp_op == FP_DIV_OP));} + bool is_fpmul() const { return ((sp_op == FP_MUL_OP));} + bool is_dp() const { return ((sp_op == DP___OP));} + bool is_dpdiv() const { return ((sp_op == DP_DIV_OP));} + bool is_dpmul() const { return ((sp_op == DP_MUL_OP));} + bool is_imul() const { return ((sp_op == INT_MUL_OP));} + bool is_imul24() const { return ((sp_op == INT_MUL24_OP));} + bool is_imul32() const { return ((sp_op == INT_MUL32_OP));} + bool is_idiv() const { return ((sp_op == INT_DIV_OP));} + bool is_sfu() const {return ((sp_op == FP_SQRT_OP) || (sp_op == FP_LG_OP) || (sp_op == FP_SIN_OP) || (sp_op == FP_EXP_OP) || (sp_op == TENSOR__OP));} + bool is_alu() const {return (sp_op == INT__OP);} + unsigned get_num_operands() const { return num_operands; } unsigned get_num_regs() const { return num_regs; } void set_num_regs(unsigned num) { num_regs = num; } @@ -962,6 +1008,7 @@ class inst_t { operation_pipeline op_pipe; // code (uarch visible) identify the pipeline of // the operation (SP, SFU or MEM) mem_operation mem_op; // code (uarch visible) identify memory type + bool const_cache_operand; // has a load from constant memory as an operand _memory_op_t memory_op; // memory_op used by ptxplus unsigned num_operands; unsigned num_regs; // count vector operand as one register operand diff --git a/src/gpuwattch/Alpha21364.xml b/src/accelwattch/Alpha21364.xml similarity index 100% rename from src/gpuwattch/Alpha21364.xml rename to src/accelwattch/Alpha21364.xml diff --git a/src/gpuwattch/Niagara1.xml b/src/accelwattch/Niagara1.xml similarity index 100% rename from src/gpuwattch/Niagara1.xml rename to src/accelwattch/Niagara1.xml diff --git a/src/gpuwattch/Niagara1_sharing.xml b/src/accelwattch/Niagara1_sharing.xml 
similarity index 100% rename from src/gpuwattch/Niagara1_sharing.xml rename to src/accelwattch/Niagara1_sharing.xml diff --git a/src/gpuwattch/Niagara1_sharing_DC.xml b/src/accelwattch/Niagara1_sharing_DC.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_DC.xml rename to src/accelwattch/Niagara1_sharing_DC.xml diff --git a/src/gpuwattch/Niagara1_sharing_SBT.xml b/src/accelwattch/Niagara1_sharing_SBT.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_SBT.xml rename to src/accelwattch/Niagara1_sharing_SBT.xml diff --git a/src/gpuwattch/Niagara1_sharing_ST.xml b/src/accelwattch/Niagara1_sharing_ST.xml similarity index 100% rename from src/gpuwattch/Niagara1_sharing_ST.xml rename to src/accelwattch/Niagara1_sharing_ST.xml diff --git a/src/gpuwattch/Niagara2.xml b/src/accelwattch/Niagara2.xml similarity index 100% rename from src/gpuwattch/Niagara2.xml rename to src/accelwattch/Niagara2.xml diff --git a/src/gpuwattch/Penryn.xml b/src/accelwattch/Penryn.xml similarity index 100% rename from src/gpuwattch/Penryn.xml rename to src/accelwattch/Penryn.xml diff --git a/src/gpuwattch/README b/src/accelwattch/README similarity index 100% rename from src/gpuwattch/README rename to src/accelwattch/README diff --git a/src/gpuwattch/XML_Parse.cc b/src/accelwattch/XML_Parse.cc similarity index 92% rename from src/gpuwattch/XML_Parse.cc rename to src/accelwattch/XML_Parse.cc index 1b9a38ae1..eaec74806 100644 --- a/src/gpuwattch/XML_Parse.cc +++ b/src/accelwattch/XML_Parse.cc @@ -30,12 +30,14 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of 
Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ + #include "XML_Parse.h" #include #include @@ -43,13 +45,14 @@ using namespace std; -const char* perf_count_label[] = { - "TOT_INST,", "FP_INT,", "IC_H,", "IC_M,", "DC_RH,", - "DC_RM,", "DC_WH,", "DC_WM,", "TC_H,", "TC_M,", - "CC_H,", "CC_M,", "SHRD_ACC,", "REG_RD,", "REG_WR,", - "NON_REG_OPs,", "SP_ACC,", "SFU_ACC,", "FPU_ACC,", "MEM_RD,", - "MEM_WR,", "MEM_PRE,", "L2_RH,", "L2_RM,", "L2_WH,", - "L2_WM,", "NOC_A,", "PIPE_A,", "IDLE_CORE_N,", "CONST_DYNAMICN"}; +const char * perf_count_label[] = { + "TOT_INST,", "FP_INT,", "IC_H,", "IC_M,", "DC_RH,", "DC_RM,", "DC_WH,", "DC_WM,", + "TC_H,", "TC_M,", "CC_H,", "CC_M,", "SHRD_ACC,", "REG_RD,", "REG_WR,", "NON_REG_OPs,", + "INT_ACC,", "FPU_ACC,", "DPU_ACC,", "INT_MUL24_ACC,", "INT_MUL32_ACC,", "INT_MUL_ACC,","INT_DIV_ACC,", + "FP_MUL_ACC,", "FP_DIV_ACC,", "FP_SQRT_ACC,", "FP_LG_ACC,", "FP_SIN_ACC,", "FP_EXP_ACC,", "DP_MUL_ACC,", + "DP_DIV_ACC,", "TENSOR_ACC,", "TEX_ACC,", "MEM_RD,","MEM_WR,", "MEM_PRE,", "L2_RH,", "L2_RM,", "L2_WH,", + "L2_WM,", "NOC_A,", "PIPE_A,", "IDLE_CORE_N,", "constant_power"}; + void ParseXML::parse(char* filepath) { unsigned int i, j, k, m, n; @@ -160,6 +163,199 @@ void ParseXML::parse(char* filepath) { atoi(xNode2.getChildNode("param", i).getAttribute("value")); continue; } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "modeled_chip_voltage_ref") == 0) { + sys.modeled_chip_voltage_ref = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat1_flane") == 0) { + sys.static_cat1_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", 
i).getAttribute("name"), + "static_cat2_flane") == 0) { + sys.static_cat2_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat3_flane") == 0) { + sys.static_cat3_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat4_flane") == 0) { + sys.static_cat4_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat5_flane") == 0) { + sys.static_cat5_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat6_flane") == 0) { + sys.static_cat6_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_shared_flane") == 0) { + sys.static_shared_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l1_flane") == 0) { + sys.static_l1_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l2_flane") == 0) { + sys.static_l2_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_light_flane") == 0) { + sys.static_light_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intadd_flane") == 0) { + sys.static_intadd_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + 
continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intmul_flane") == 0) { + sys.static_intmul_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_geomean_flane") == 0) { + sys.static_geomean_flane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat1_addlane") == 0) { + sys.static_cat1_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat2_addlane") == 0) { + sys.static_cat2_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat3_addlane") == 0) { + sys.static_cat3_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat4_addlane") == 0) { + sys.static_cat4_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat5_addlane") == 0) { + sys.static_cat5_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_cat6_addlane") == 0) { + sys.static_cat6_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_shared_addlane") == 0) { + sys.static_shared_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l1_addlane") == 0) 
{ + sys.static_l1_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_l2_addlane") == 0) { + sys.static_l2_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_light_addlane") == 0) { + sys.static_light_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intadd_addlane") == 0) { + sys.static_intadd_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_intmul_addlane") == 0) { + sys.static_intmul_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "static_geomean_addlane") == 0) { + sys.static_geomean_addlane = + atof(xNode2.getChildNode("param", i).getAttribute("value")); + continue; + } + + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "target_chip_area") == 0) { sys.target_chip_area = @@ -419,22 +615,106 @@ void ParseXML::parse(char* filepath) { atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "SP_ACC") == 0) { - sys.scaling_coefficients[SP_ACC] = - atof(xNode2.getChildNode("param", i).getAttribute("value")); + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_ACC")==0) { + sys.scaling_coefficients[INT_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "SFU_ACC") == 0) { - sys.scaling_coefficients[SFU_ACC] = - atof(xNode2.getChildNode("param", i).getAttribute("value")); + if 
(strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_ACC")==0) { + sys.scaling_coefficients[FP_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "FPU_ACC") == 0) { - sys.scaling_coefficients[FPU_ACC] = - atof(xNode2.getChildNode("param", i).getAttribute("value")); + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "DP_ACC")==0) { + sys.scaling_coefficients[DP_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_MUL24_ACC")==0) { + sys.scaling_coefficients[INT_MUL24_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_MUL32_ACC")==0) { + sys.scaling_coefficients[INT_MUL32_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_MUL_ACC")==0) { + sys.scaling_coefficients[INT_MUL_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "INT_DIV_ACC")==0) { + sys.scaling_coefficients[INT_DIV_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_MUL_ACC")==0) { + sys.scaling_coefficients[FP_MUL_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_DIV_ACC")==0) { + sys.scaling_coefficients[FP_DIV_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_SQRT_ACC")==0) { + sys.scaling_coefficients[FP_SQRT_ACC] = + 
atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_LG_ACC")==0) { + sys.scaling_coefficients[FP_LG_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_SIN_ACC")==0) { + sys.scaling_coefficients[FP_SIN_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "FP_EXP_ACC")==0) { + sys.scaling_coefficients[FP_EXP_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "DP_MUL_ACC")==0) { + sys.scaling_coefficients[DP_MUL_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "DP_DIV_ACC")==0) { + sys.scaling_coefficients[DP_DIV_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "TENSOR_ACC")==0) { + sys.scaling_coefficients[TENSOR_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); + continue; + } + if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), + "TEX_ACC")==0) { + sys.scaling_coefficients[TEX_ACC] = + atof(xNode2.getChildNode("param",i).getAttribute("value")); continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), @@ -498,8 +778,8 @@ void ParseXML::parse(char* filepath) { continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), - "CONST_DYNAMICN") == 0) { - sys.scaling_coefficients[CONST_DYNAMICN] = + "constant_power") == 0) { + sys.scaling_coefficients[constant_power] = atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } @@ -4187,8 +4467,9 @@ void ParseXML::initialize() // Initialize 
all // strcpy(sys.homogeneous_cores,"default"); sys.core_tech_node = 1; sys.target_core_clockrate = 1; + sys.modeled_chip_voltage_ref = 1; sys.target_chip_area = 1; - sys.temperature = 1; + sys.temperature = 340; sys.number_cache_levels = 1; sys.homogeneous_cores = 1; sys.homogeneous_L1Directories = 1; @@ -4198,6 +4479,34 @@ void ParseXML::initialize() // Initialize all sys.homogeneous_NoCs = 1; sys.homogeneous_ccs = 1; + sys.static_cat1_flane = 0; + sys.static_cat2_flane = 0; + sys.static_cat3_flane = 0; + sys.static_cat4_flane = 0; + sys.static_cat5_flane = 0; + sys.static_cat6_flane = 0; + sys.static_shared_flane = 0; + sys.static_l1_flane = 0; + sys.static_l2_flane = 0; + sys.static_light_flane = 0; + sys.static_intadd_flane = 0; + sys.static_intmul_flane = 0; + sys.static_geomean_flane = 0; + + sys.static_cat1_addlane = 0; + sys.static_cat2_addlane = 0; + sys.static_cat3_addlane = 0; + sys.static_cat4_addlane = 0; + sys.static_cat5_addlane = 0; + sys.static_cat6_addlane = 0; + sys.static_shared_addlane = 0; + sys.static_l1_addlane = 0; + sys.static_l2_addlane = 0; + sys.static_light_addlane = 0; + sys.static_intadd_addlane = 0; + sys.static_intmul_addlane = 0; + sys.static_geomean_addlane = 0; + sys.Max_area_deviation = 1; sys.Max_power_deviation = 1; sys.device_type = 1; diff --git a/src/gpuwattch/XML_Parse.h b/src/accelwattch/XML_Parse.h similarity index 89% rename from src/gpuwattch/XML_Parse.h rename to src/accelwattch/XML_Parse.h index 30c4e4b13..c82359faf 100644 --- a/src/gpuwattch/XML_Parse.h +++ b/src/accelwattch/XML_Parse.h @@ -30,10 +30,11 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, 
Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ #ifndef XML_PARSE_H_ @@ -69,7 +70,7 @@ ToXMLStringTool tx,tx2; extern const char* perf_count_label[]; enum perf_count_t { - TOT_INST = 0, + TOT_INST=0, FP_INT, IC_H, IC_M, @@ -85,9 +86,23 @@ enum perf_count_t { REG_RD, REG_WR, NON_REG_OPs, - SP_ACC, - SFU_ACC, - FPU_ACC, + INT_ACC, //SPU + FP_ACC, //FPU + DP_ACC, //FPU + INT_MUL24_ACC, //SFU + INT_MUL32_ACC, //SFU + INT_MUL_ACC, //SFU + INT_DIV_ACC, //SFU + FP_MUL_ACC, //SFU + FP_DIV_ACC, //SFU + FP_SQRT_ACC, //SFU + FP_LG_ACC, //SFU + FP_SIN_ACC, //SFU + FP_EXP_ACC, //SFU + DP_MUL_ACC, //SFU + DP_DIV_ACC, //SFU + TENSOR_ACC, //SFU + TEX_ACC, //SFU MEM_RD, MEM_WR, MEM_PRE, @@ -98,7 +113,7 @@ enum perf_count_t { NOC_A, PIPE_A, IDLE_CORE_N, - CONST_DYNAMICN, + constant_power, NUM_PERFORMANCE_COUNTERS }; @@ -635,6 +650,33 @@ typedef struct { int homogeneous_L2Directories; double core_tech_node; int target_core_clockrate; + double modeled_chip_voltage_ref; + double static_cat1_flane; + double static_cat2_flane; + double static_cat3_flane; + double static_cat4_flane; + double static_cat5_flane; + double static_cat6_flane; + double static_shared_flane; + double static_l1_flane; + double static_l2_flane; + double static_light_flane; + double static_intadd_flane; + double static_intmul_flane; + double static_geomean_flane; + double static_cat1_addlane; + double static_cat2_addlane; + double static_cat3_addlane; + double static_cat4_addlane; + double static_cat5_addlane; + double static_cat6_addlane; + double static_shared_addlane; + double static_l1_addlane; + double static_l2_addlane; + double static_light_addlane; + double static_intadd_addlane; + double static_intmul_addlane; + double static_geomean_addlane; int target_chip_area; int temperature; 
int number_cache_levels; diff --git a/src/gpuwattch/Xeon.xml b/src/accelwattch/Xeon.xml similarity index 100% rename from src/gpuwattch/Xeon.xml rename to src/accelwattch/Xeon.xml diff --git a/src/gpuwattch/arch_const.h b/src/accelwattch/arch_const.h similarity index 100% rename from src/gpuwattch/arch_const.h rename to src/accelwattch/arch_const.h diff --git a/src/gpuwattch/array.cc b/src/accelwattch/array.cc similarity index 100% rename from src/gpuwattch/array.cc rename to src/accelwattch/array.cc diff --git a/src/gpuwattch/array.h b/src/accelwattch/array.h similarity index 100% rename from src/gpuwattch/array.h rename to src/accelwattch/array.h diff --git a/src/gpuwattch/basic_components.cc b/src/accelwattch/basic_components.cc similarity index 100% rename from src/gpuwattch/basic_components.cc rename to src/accelwattch/basic_components.cc diff --git a/src/gpuwattch/basic_components.h b/src/accelwattch/basic_components.h similarity index 100% rename from src/gpuwattch/basic_components.h rename to src/accelwattch/basic_components.h diff --git a/src/gpuwattch/cacti/README b/src/accelwattch/cacti/README similarity index 100% rename from src/gpuwattch/cacti/README rename to src/accelwattch/cacti/README diff --git a/src/gpuwattch/cacti/Ucache.cc b/src/accelwattch/cacti/Ucache.cc similarity index 99% rename from src/gpuwattch/cacti/Ucache.cc rename to src/accelwattch/cacti/Ucache.cc index 8f733f73b..e92e67b91 100644 --- a/src/gpuwattch/cacti/Ucache.cc +++ b/src/accelwattch/cacti/Ucache.cc @@ -223,7 +223,7 @@ void * calc_time_mt_wrapper(void * void_obj) delete tag_arr.back(); data_arr.pop_back(); tag_arr.pop_back(); - + pthread_exit(NULL); } @@ -246,7 +246,7 @@ bool calculate_time( { DynamicParameter dyn_p(is_tag, pure_ram, pure_cam, Nspd, Ndwl, Ndbl, Ndcm, Ndsam_lev_1, Ndsam_lev_2, is_main_mem); - if (dyn_p.is_valid == false) + if (dyn_p.is_valid != true) { return false; } diff --git a/src/gpuwattch/cacti/Ucache.h b/src/accelwattch/cacti/Ucache.h similarity index 
100% rename from src/gpuwattch/cacti/Ucache.h rename to src/accelwattch/cacti/Ucache.h diff --git a/src/gpuwattch/cacti/arbiter.cc b/src/accelwattch/cacti/arbiter.cc similarity index 100% rename from src/gpuwattch/cacti/arbiter.cc rename to src/accelwattch/cacti/arbiter.cc diff --git a/src/gpuwattch/cacti/arbiter.h b/src/accelwattch/cacti/arbiter.h similarity index 100% rename from src/gpuwattch/cacti/arbiter.h rename to src/accelwattch/cacti/arbiter.h diff --git a/src/gpuwattch/cacti/area.cc b/src/accelwattch/cacti/area.cc similarity index 100% rename from src/gpuwattch/cacti/area.cc rename to src/accelwattch/cacti/area.cc diff --git a/src/gpuwattch/cacti/area.h b/src/accelwattch/cacti/area.h similarity index 100% rename from src/gpuwattch/cacti/area.h rename to src/accelwattch/cacti/area.h diff --git a/src/gpuwattch/cacti/bank.cc b/src/accelwattch/cacti/bank.cc similarity index 100% rename from src/gpuwattch/cacti/bank.cc rename to src/accelwattch/cacti/bank.cc diff --git a/src/gpuwattch/cacti/bank.h b/src/accelwattch/cacti/bank.h similarity index 100% rename from src/gpuwattch/cacti/bank.h rename to src/accelwattch/cacti/bank.h diff --git a/src/gpuwattch/cacti/basic_circuit.cc b/src/accelwattch/cacti/basic_circuit.cc similarity index 100% rename from src/gpuwattch/cacti/basic_circuit.cc rename to src/accelwattch/cacti/basic_circuit.cc diff --git a/src/gpuwattch/cacti/basic_circuit.h b/src/accelwattch/cacti/basic_circuit.h similarity index 100% rename from src/gpuwattch/cacti/basic_circuit.h rename to src/accelwattch/cacti/basic_circuit.h diff --git a/src/gpuwattch/cacti/batch_tests b/src/accelwattch/cacti/batch_tests similarity index 100% rename from src/gpuwattch/cacti/batch_tests rename to src/accelwattch/cacti/batch_tests diff --git a/src/gpuwattch/cacti/cache.cfg b/src/accelwattch/cacti/cache.cfg similarity index 100% rename from src/gpuwattch/cacti/cache.cfg rename to src/accelwattch/cacti/cache.cfg diff --git a/src/gpuwattch/cacti/cacti.i 
b/src/accelwattch/cacti/cacti.i similarity index 100% rename from src/gpuwattch/cacti/cacti.i rename to src/accelwattch/cacti/cacti.i diff --git a/src/gpuwattch/cacti/cacti.mk b/src/accelwattch/cacti/cacti.mk similarity index 96% rename from src/gpuwattch/cacti/cacti.mk rename to src/accelwattch/cacti/cacti.mk index 7f3c57338..41f9218f4 100644 --- a/src/gpuwattch/cacti/cacti.mk +++ b/src/accelwattch/cacti/cacti.mk @@ -1,5 +1,5 @@ -OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/gpuwattch/cacti +OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/accelwattch/cacti TARGET = cacti SHELL = /bin/sh .PHONY: all depend clean diff --git a/src/gpuwattch/cacti/cacti_interface.cc b/src/accelwattch/cacti/cacti_interface.cc similarity index 100% rename from src/gpuwattch/cacti/cacti_interface.cc rename to src/accelwattch/cacti/cacti_interface.cc diff --git a/src/gpuwattch/cacti/cacti_interface.h b/src/accelwattch/cacti/cacti_interface.h similarity index 100% rename from src/gpuwattch/cacti/cacti_interface.h rename to src/accelwattch/cacti/cacti_interface.h diff --git a/src/gpuwattch/cacti/component.cc b/src/accelwattch/cacti/component.cc similarity index 100% rename from src/gpuwattch/cacti/component.cc rename to src/accelwattch/cacti/component.cc diff --git a/src/gpuwattch/cacti/component.h b/src/accelwattch/cacti/component.h similarity index 100% rename from src/gpuwattch/cacti/component.h rename to src/accelwattch/cacti/component.h diff --git a/src/gpuwattch/cacti/const.h b/src/accelwattch/cacti/const.h similarity index 100% rename from src/gpuwattch/cacti/const.h rename to src/accelwattch/cacti/const.h diff --git a/src/gpuwattch/cacti/contention.dat b/src/accelwattch/cacti/contention.dat similarity index 100% rename from src/gpuwattch/cacti/contention.dat rename to src/accelwattch/cacti/contention.dat diff --git a/src/gpuwattch/cacti/crossbar.cc b/src/accelwattch/cacti/crossbar.cc similarity index 100% rename from src/gpuwattch/cacti/crossbar.cc rename to src/accelwattch/cacti/crossbar.cc diff --git 
a/src/gpuwattch/cacti/crossbar.h b/src/accelwattch/cacti/crossbar.h similarity index 100% rename from src/gpuwattch/cacti/crossbar.h rename to src/accelwattch/cacti/crossbar.h diff --git a/src/gpuwattch/cacti/decoder.cc b/src/accelwattch/cacti/decoder.cc similarity index 100% rename from src/gpuwattch/cacti/decoder.cc rename to src/accelwattch/cacti/decoder.cc diff --git a/src/gpuwattch/cacti/decoder.h b/src/accelwattch/cacti/decoder.h similarity index 100% rename from src/gpuwattch/cacti/decoder.h rename to src/accelwattch/cacti/decoder.h diff --git a/src/gpuwattch/cacti/highradix.cc b/src/accelwattch/cacti/highradix.cc similarity index 100% rename from src/gpuwattch/cacti/highradix.cc rename to src/accelwattch/cacti/highradix.cc diff --git a/src/gpuwattch/cacti/highradix.h b/src/accelwattch/cacti/highradix.h similarity index 100% rename from src/gpuwattch/cacti/highradix.h rename to src/accelwattch/cacti/highradix.h diff --git a/src/gpuwattch/cacti/htree2.cc b/src/accelwattch/cacti/htree2.cc similarity index 100% rename from src/gpuwattch/cacti/htree2.cc rename to src/accelwattch/cacti/htree2.cc diff --git a/src/gpuwattch/cacti/htree2.h b/src/accelwattch/cacti/htree2.h similarity index 100% rename from src/gpuwattch/cacti/htree2.h rename to src/accelwattch/cacti/htree2.h diff --git a/src/gpuwattch/cacti/io.cc b/src/accelwattch/cacti/io.cc similarity index 100% rename from src/gpuwattch/cacti/io.cc rename to src/accelwattch/cacti/io.cc diff --git a/src/gpuwattch/cacti/io.h b/src/accelwattch/cacti/io.h similarity index 100% rename from src/gpuwattch/cacti/io.h rename to src/accelwattch/cacti/io.h diff --git a/src/gpuwattch/cacti/main.cc b/src/accelwattch/cacti/main.cc similarity index 100% rename from src/gpuwattch/cacti/main.cc rename to src/accelwattch/cacti/main.cc diff --git a/src/gpuwattch/cacti/makefile b/src/accelwattch/cacti/makefile similarity index 100% rename from src/gpuwattch/cacti/makefile rename to src/accelwattch/cacti/makefile diff --git 
a/src/gpuwattch/cacti/mat.cc b/src/accelwattch/cacti/mat.cc similarity index 100% rename from src/gpuwattch/cacti/mat.cc rename to src/accelwattch/cacti/mat.cc diff --git a/src/gpuwattch/cacti/mat.h b/src/accelwattch/cacti/mat.h similarity index 100% rename from src/gpuwattch/cacti/mat.h rename to src/accelwattch/cacti/mat.h diff --git a/src/gpuwattch/cacti/nuca.cc b/src/accelwattch/cacti/nuca.cc similarity index 100% rename from src/gpuwattch/cacti/nuca.cc rename to src/accelwattch/cacti/nuca.cc diff --git a/src/gpuwattch/cacti/nuca.h b/src/accelwattch/cacti/nuca.h similarity index 100% rename from src/gpuwattch/cacti/nuca.h rename to src/accelwattch/cacti/nuca.h diff --git a/src/gpuwattch/cacti/out_batch_test_result.csv b/src/accelwattch/cacti/out_batch_test_result.csv similarity index 100% rename from src/gpuwattch/cacti/out_batch_test_result.csv rename to src/accelwattch/cacti/out_batch_test_result.csv diff --git a/src/gpuwattch/cacti/parameter.cc b/src/accelwattch/cacti/parameter.cc similarity index 100% rename from src/gpuwattch/cacti/parameter.cc rename to src/accelwattch/cacti/parameter.cc diff --git a/src/gpuwattch/cacti/parameter.h b/src/accelwattch/cacti/parameter.h similarity index 100% rename from src/gpuwattch/cacti/parameter.h rename to src/accelwattch/cacti/parameter.h diff --git a/src/gpuwattch/cacti/router.cc b/src/accelwattch/cacti/router.cc similarity index 100% rename from src/gpuwattch/cacti/router.cc rename to src/accelwattch/cacti/router.cc diff --git a/src/gpuwattch/cacti/router.h b/src/accelwattch/cacti/router.h similarity index 100% rename from src/gpuwattch/cacti/router.h rename to src/accelwattch/cacti/router.h diff --git a/src/gpuwattch/cacti/subarray.cc b/src/accelwattch/cacti/subarray.cc similarity index 100% rename from src/gpuwattch/cacti/subarray.cc rename to src/accelwattch/cacti/subarray.cc diff --git a/src/gpuwattch/cacti/subarray.h b/src/accelwattch/cacti/subarray.h similarity index 100% rename from 
src/gpuwattch/cacti/subarray.h rename to src/accelwattch/cacti/subarray.h diff --git a/src/gpuwattch/cacti/technology.cc b/src/accelwattch/cacti/technology.cc similarity index 100% rename from src/gpuwattch/cacti/technology.cc rename to src/accelwattch/cacti/technology.cc diff --git a/src/gpuwattch/cacti/uca.cc b/src/accelwattch/cacti/uca.cc similarity index 100% rename from src/gpuwattch/cacti/uca.cc rename to src/accelwattch/cacti/uca.cc diff --git a/src/gpuwattch/cacti/uca.h b/src/accelwattch/cacti/uca.h similarity index 100% rename from src/gpuwattch/cacti/uca.h rename to src/accelwattch/cacti/uca.h diff --git a/src/gpuwattch/cacti/wire.cc b/src/accelwattch/cacti/wire.cc similarity index 100% rename from src/gpuwattch/cacti/wire.cc rename to src/accelwattch/cacti/wire.cc diff --git a/src/gpuwattch/cacti/wire.h b/src/accelwattch/cacti/wire.h similarity index 100% rename from src/gpuwattch/cacti/wire.h rename to src/accelwattch/cacti/wire.h diff --git a/src/gpuwattch/core.cc b/src/accelwattch/core.cc similarity index 100% rename from src/gpuwattch/core.cc rename to src/accelwattch/core.cc diff --git a/src/gpuwattch/core.h b/src/accelwattch/core.h similarity index 100% rename from src/gpuwattch/core.h rename to src/accelwattch/core.h diff --git a/src/gpuwattch/fermi.xml b/src/accelwattch/fermi.xml similarity index 100% rename from src/gpuwattch/fermi.xml rename to src/accelwattch/fermi.xml diff --git a/src/gpuwattch/globalvar.h b/src/accelwattch/globalvar.h similarity index 100% rename from src/gpuwattch/globalvar.h rename to src/accelwattch/globalvar.h diff --git a/src/gpuwattch/gpgpu.xml b/src/accelwattch/gpgpu.xml similarity index 100% rename from src/gpuwattch/gpgpu.xml rename to src/accelwattch/gpgpu.xml diff --git a/src/gpuwattch/gpgpu_sim.verify b/src/accelwattch/gpgpu_sim.verify similarity index 100% rename from src/gpuwattch/gpgpu_sim.verify rename to src/accelwattch/gpgpu_sim.verify diff --git a/src/accelwattch/gpgpu_sim_wrapper.cc 
b/src/accelwattch/gpgpu_sim_wrapper.cc new file mode 100644 index 000000000..67d9daa1f --- /dev/null +++ b/src/accelwattch/gpgpu_sim_wrapper.cc @@ -0,0 +1,1143 @@ +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+ +#include "gpgpu_sim_wrapper.h" +#include <sys/stat.h> +#define SP_BASE_POWER 0 +#define SFU_BASE_POWER 0 + +static const char* pwr_cmp_label[] = { + "IBP,", "ICP,", "DCP,", "TCP,", "CCP,", "SHRDP,", "RFP,", "INTP,", + "FPUP,", "DPUP,", "INT_MUL24P,", "INT_MUL32P,", "INT_MULP,", "INT_DIVP,", + "FP_MULP,", "FP_DIVP,", "FP_SQRTP,", "FP_LGP,", "FP_SINP,", "FP_EXP,", + "DP_MULP,", "DP_DIVP,", "TENSORP,", "TEXP,", "SCHEDP,", "L2CP,", "MCP,", "NOCP,", + "DRAMP,", "PIPEP,", "IDLE_COREP,", "CONSTP", "STATICP"}; + +enum pwr_cmp_t { + IBP=0, + ICP, + DCP, + TCP, + CCP, + SHRDP, + RFP, + INTP, + FPUP, + DPUP, + INT_MUL24P, + INT_MUL32P, + INT_MULP, + INT_DIVP, + FP_MULP, + FP_DIVP, + FP_SQRTP, + FP_LGP, + FP_SINP, + FP_EXP, + DP_MULP, + DP_DIVP, + TENSORP, + TEXP, + SCHEDP, + L2CP, + MCP, + NOCP, + DRAMP, + PIPEP, + IDLE_COREP, + CONSTP, + STATICP, + NUM_COMPONENTS_MODELLED +}; + +gpgpu_sim_wrapper::gpgpu_sim_wrapper(bool power_simulation_enabled, + char* xmlfile, int power_simulation_mode, bool dvfs_enabled) { + kernel_sample_count = 0; + total_sample_count = 0; + + kernel_tot_power = 0; + avg_threads_per_warp_tot = 0; + num_pwr_cmps = NUM_COMPONENTS_MODELLED; + num_perf_counters = NUM_PERFORMANCE_COUNTERS; + + // Initialize per-component counter/power vectors + avg_max_min_counters init; + kernel_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, init); + kernel_cmp_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, init); + + kernel_power = init; // Per-kernel powers + gpu_tot_power = init; // Global powers + + sample_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, 0); + + sample_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, 0); + initpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); + effpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); + + const_dynamic_power = 0; + proc_power = 0; + + g_power_filename = NULL; + g_power_trace_filename = NULL; + g_metric_trace_filename = NULL; + g_steady_state_tracking_filename = NULL; + xml_filename = xmlfile; + g_power_simulation_enabled = power_simulation_enabled; + 
g_power_simulation_mode = power_simulation_mode; + g_dvfs_enabled = dvfs_enabled; + g_power_trace_enabled = false; + g_steady_power_levels_enabled = false; + g_power_trace_zlevel = 0; + g_power_per_cycle_dump = false; + gpu_steady_power_deviation = 0; + gpu_steady_min_period = 0; + + gpu_stat_sample_freq = 0; + p = new ParseXML(); + if (g_power_simulation_enabled) { + p->parse(xml_filename); + } + proc = new Processor(p); + power_trace_file = NULL; + metric_trace_file = NULL; + steady_state_tacking_file = NULL; + has_written_avg = false; + init_inst_val = false; +} + +gpgpu_sim_wrapper::~gpgpu_sim_wrapper() {} + +bool gpgpu_sim_wrapper::sanity_check(double a, double b) { + if (b == 0) + return (abs(a - b) < 0.00001); + else + return (abs(a - b) / abs(b) < 0.00001); + + return false; +} +void gpgpu_sim_wrapper::init_mcpat_hw_mode(unsigned gpu_sim_cycle) { + p->sys.total_cycles = gpu_sim_cycle; //total simulated cycles for current kernel +} + +void gpgpu_sim_wrapper::init_mcpat( + char* xmlfile, char* powerfilename, char* power_trace_filename, + char* metric_trace_filename, char* steady_state_filename, + bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, + bool power_per_cycle_dump, double steady_power_deviation, + double steady_min_period, int zlevel, double init_val, + int stat_sample_freq, int power_sim_mode, bool dvfs_enabled, + unsigned clock_freq, unsigned num_shaders) { + // Write File Headers for (-metrics trace, -power trace) + + reset_counters(); + static bool mcpat_init = true; + + // initialize file name if it is not set + time_t curr_time; + time(&curr_time); + char* date = ctime(&curr_time); + char* s = date; + while (*s) { + if (*s == ' ' || *s == '\t' || *s == ':') *s = '-'; + if (*s == '\n' || *s == '\r') *s = 0; + s++; + } + + if (mcpat_init) { + g_power_filename = powerfilename; + g_power_trace_filename = power_trace_filename; + g_metric_trace_filename = metric_trace_filename; + g_steady_state_tracking_filename = 
steady_state_filename; + xml_filename = xmlfile; + g_power_simulation_enabled = power_sim_enabled; + g_power_simulation_mode = power_sim_mode; + g_dvfs_enabled = dvfs_enabled; + g_power_trace_enabled = trace_enabled; + g_steady_power_levels_enabled = steady_state_enabled; + g_power_trace_zlevel = zlevel; + g_power_per_cycle_dump = power_per_cycle_dump; + gpu_steady_power_deviation = steady_power_deviation; + gpu_steady_min_period = steady_min_period; + + gpu_stat_sample_freq = stat_sample_freq; + + // p->sys.total_cycles=gpu_stat_sample_freq*4; + p->sys.total_cycles = gpu_stat_sample_freq; + p->sys.target_core_clockrate = clock_freq; + p->sys.number_of_cores = num_shaders; + p->sys.core[0].clock_rate = clock_freq; + power_trace_file = NULL; + metric_trace_file = NULL; + steady_state_tacking_file = NULL; + + if (g_power_trace_enabled) { + power_trace_file = gzopen(g_power_trace_filename, "w"); + metric_trace_file = gzopen(g_metric_trace_filename, "w"); + if ((power_trace_file == NULL) || (metric_trace_file == NULL)) { + printf("error - could not open trace files \n"); + exit(1); + } + gzsetparams(power_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); + + gzprintf(power_trace_file, "power,"); + for (unsigned i = 0; i < num_pwr_cmps; i++) { + gzprintf(power_trace_file, pwr_cmp_label[i]); + } + gzprintf(power_trace_file, "\n"); + + gzsetparams(metric_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); + for (unsigned i = 0; i < num_perf_counters; i++) { + gzprintf(metric_trace_file, perf_count_label[i]); + } + gzprintf(metric_trace_file, "\n"); + + gzclose(power_trace_file); + gzclose(metric_trace_file); + } + if (g_steady_power_levels_enabled) { + steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "w"); + if ((steady_state_tacking_file == NULL)) { + printf("error - could not open trace files \n"); + exit(1); + } + gzsetparams(steady_state_tacking_file, g_power_trace_zlevel, + Z_DEFAULT_STRATEGY); + gzprintf(steady_state_tacking_file, 
"start,end,power,IPC,"); + for (unsigned i = 0; i < num_perf_counters; i++) { + gzprintf(steady_state_tacking_file, perf_count_label[i]); + } + gzprintf(steady_state_tacking_file, "\n"); + + gzclose(steady_state_tacking_file); + } + + mcpat_init = false; + has_written_avg = false; + powerfile.open(g_power_filename); + int flg = chmod(g_power_filename, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + assert(flg == 0); + } + sample_val = 0; + init_inst_val = init_val; // gpu_tot_sim_insn+gpu_sim_insn; +} + +void gpgpu_sim_wrapper::reset_counters() { + avg_max_min_counters init; + for (unsigned i = 0; i < num_perf_counters; ++i) { + sample_perf_counters[i] = 0; + kernel_cmp_perf_counters[i] = init; + } + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + sample_cmp_pwr[i] = 0; + kernel_cmp_pwr[i] = init; + } + + // Reset per-kernel counters + kernel_sample_count = 0; + kernel_tot_power = 0; + kernel_power = init; + avg_threads_per_warp_tot = 0; + return; +} + +void gpgpu_sim_wrapper::set_inst_power(bool clk_gated_lanes, double tot_cycles, + double busy_cycles, double tot_inst, + double int_inst, double fp_inst, + double load_inst, double store_inst, + double committed_inst) { + p->sys.core[0].gpgpu_clock_gated_lanes = clk_gated_lanes; + p->sys.core[0].total_cycles = tot_cycles; + p->sys.core[0].busy_cycles = busy_cycles; + p->sys.core[0].total_instructions = + tot_inst * p->sys.scaling_coefficients[TOT_INST]; + p->sys.core[0].int_instructions = + int_inst * p->sys.scaling_coefficients[FP_INT]; + p->sys.core[0].fp_instructions = + fp_inst * p->sys.scaling_coefficients[FP_INT]; + p->sys.core[0].load_instructions = load_inst; + p->sys.core[0].store_instructions = store_inst; + p->sys.core[0].committed_instructions = committed_inst; + sample_perf_counters[FP_INT] = int_inst + fp_inst; + sample_perf_counters[TOT_INST] = tot_inst; +} + +void gpgpu_sim_wrapper::set_regfile_power(double reads, double writes, + double ops) { + p->sys.core[0].int_regfile_reads = + reads * 
p->sys.scaling_coefficients[REG_RD]; + p->sys.core[0].int_regfile_writes = + writes * p->sys.scaling_coefficients[REG_WR]; + p->sys.core[0].non_rf_operands = + ops * p->sys.scaling_coefficients[NON_REG_OPs]; + sample_perf_counters[REG_RD] = reads; + sample_perf_counters[REG_WR] = writes; + sample_perf_counters[NON_REG_OPs] = ops; +} + +void gpgpu_sim_wrapper::set_icache_power(double hits, double misses) { + p->sys.core[0].icache.read_accesses = + hits * p->sys.scaling_coefficients[IC_H] + + misses * p->sys.scaling_coefficients[IC_M]; + p->sys.core[0].icache.read_misses = + misses * p->sys.scaling_coefficients[IC_M]; + sample_perf_counters[IC_H] = hits; + sample_perf_counters[IC_M] = misses; +} + +void gpgpu_sim_wrapper::set_ccache_power(double hits, double misses) { + p->sys.core[0].ccache.read_accesses = + hits * p->sys.scaling_coefficients[CC_H] + + misses * p->sys.scaling_coefficients[CC_M]; + p->sys.core[0].ccache.read_misses = + misses * p->sys.scaling_coefficients[CC_M]; + sample_perf_counters[CC_H] = hits; + sample_perf_counters[CC_M] = misses; + // TODO: coalescing logic is counted as part of the caches power (this is not + // valid for no-caches architectures) +} + +void gpgpu_sim_wrapper::set_tcache_power(double hits, double misses) { + p->sys.core[0].tcache.read_accesses = + hits * p->sys.scaling_coefficients[TC_H] + + misses * p->sys.scaling_coefficients[TC_M]; + p->sys.core[0].tcache.read_misses = + misses * p->sys.scaling_coefficients[TC_M]; + sample_perf_counters[TC_H] = hits; + sample_perf_counters[TC_M] = misses; + // TODO: coalescing logic is counted as part of the caches power (this is not + // valid for no-caches architectures) +} + +void gpgpu_sim_wrapper::set_shrd_mem_power(double accesses) { + p->sys.core[0].sharedmemory.read_accesses = + accesses * p->sys.scaling_coefficients[SHRD_ACC]; + sample_perf_counters[SHRD_ACC] = accesses; +} + +void gpgpu_sim_wrapper::set_l1cache_power(double read_hits, double read_misses, + double write_hits, + 
double write_misses) { + p->sys.core[0].dcache.read_accesses = + read_hits * p->sys.scaling_coefficients[DC_RH] + + read_misses * p->sys.scaling_coefficients[DC_RM]; + p->sys.core[0].dcache.read_misses = + read_misses * p->sys.scaling_coefficients[DC_RM]; + p->sys.core[0].dcache.write_accesses = + write_hits * p->sys.scaling_coefficients[DC_WH] + + write_misses * p->sys.scaling_coefficients[DC_WM]; + p->sys.core[0].dcache.write_misses = + write_misses * p->sys.scaling_coefficients[DC_WM]; + sample_perf_counters[DC_RH] = read_hits; + sample_perf_counters[DC_RM] = read_misses; + sample_perf_counters[DC_WH] = write_hits; + sample_perf_counters[DC_WM] = write_misses; + // TODO: coalescing logic is counted as part of the caches power (this is not + // valid for no-caches architectures) +} + +void gpgpu_sim_wrapper::set_l2cache_power(double read_hits, double read_misses, + double write_hits, + double write_misses) { + p->sys.l2.total_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + + read_misses * p->sys.scaling_coefficients[L2_RM] + + write_hits * p->sys.scaling_coefficients[L2_WH] + + write_misses * p->sys.scaling_coefficients[L2_WM]; + p->sys.l2.read_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + + read_misses * p->sys.scaling_coefficients[L2_RM]; + p->sys.l2.write_accesses = write_hits * p->sys.scaling_coefficients[L2_WH] + + write_misses * p->sys.scaling_coefficients[L2_WM]; + p->sys.l2.read_hits = read_hits * p->sys.scaling_coefficients[L2_RH]; + p->sys.l2.read_misses = read_misses * p->sys.scaling_coefficients[L2_RM]; + p->sys.l2.write_hits = write_hits * p->sys.scaling_coefficients[L2_WH]; + p->sys.l2.write_misses = write_misses * p->sys.scaling_coefficients[L2_WM]; + sample_perf_counters[L2_RH] = read_hits; + sample_perf_counters[L2_RM] = read_misses; + sample_perf_counters[L2_WH] = write_hits; + sample_perf_counters[L2_WM] = write_misses; +} + +void gpgpu_sim_wrapper::set_num_cores(double num_core) { + + num_cores = num_core; +} + +void 
gpgpu_sim_wrapper::set_idle_core_power(double num_idle_core) { + p->sys.num_idle_cores = num_idle_core; + sample_perf_counters[IDLE_CORE_N] = num_idle_core; + num_idle_cores = num_idle_core; +} + +void gpgpu_sim_wrapper::set_duty_cycle_power(double duty_cycle) { + p->sys.core[0].pipeline_duty_cycle = + duty_cycle * p->sys.scaling_coefficients[PIPE_A]; + sample_perf_counters[PIPE_A] = duty_cycle; +} + +void gpgpu_sim_wrapper::set_mem_ctrl_power(double reads, double writes, + double dram_precharge) { + p->sys.mc.memory_accesses = reads * p->sys.scaling_coefficients[MEM_RD] + + writes * p->sys.scaling_coefficients[MEM_WR]; + p->sys.mc.memory_reads = reads * p->sys.scaling_coefficients[MEM_RD]; + p->sys.mc.memory_writes = writes * p->sys.scaling_coefficients[MEM_WR]; + p->sys.mc.dram_pre = dram_precharge * p->sys.scaling_coefficients[MEM_PRE]; + sample_perf_counters[MEM_RD] = reads; + sample_perf_counters[MEM_WR] = writes; + sample_perf_counters[MEM_PRE] = dram_precharge; +} + + +void gpgpu_sim_wrapper::set_model_voltage(double model_voltage) { + modeled_chip_voltage = model_voltage; +} + + +void gpgpu_sim_wrapper::set_exec_unit_power(double fpu_accesses, + double ialu_accesses, + double sfu_accesses) { + p->sys.core[0].fpu_accesses = fpu_accesses; + tot_fpu_accesses = fpu_accesses; + //Integer ALU (not present in Tesla) + p->sys.core[0].ialu_accesses = ialu_accesses; + + //Sfu accesses + p->sys.core[0].mul_accesses = sfu_accesses; + tot_sfu_accesses = sfu_accesses; +} + +PowerscalingCoefficients * gpgpu_sim_wrapper::get_scaling_coeffs() +{ + + PowerscalingCoefficients * scalingCoeffs = new PowerscalingCoefficients(); + + scalingCoeffs->int_coeff = p->sys.scaling_coefficients[INT_ACC]; + scalingCoeffs->int_mul_coeff = p->sys.scaling_coefficients[INT_MUL_ACC]; + scalingCoeffs->int_mul24_coeff = p->sys.scaling_coefficients[INT_MUL24_ACC]; + scalingCoeffs->int_mul32_coeff = p->sys.scaling_coefficients[INT_MUL32_ACC]; + scalingCoeffs->int_div_coeff = 
p->sys.scaling_coefficients[INT_DIV_ACC]; + scalingCoeffs->fp_coeff = p->sys.scaling_coefficients[FP_ACC]; + scalingCoeffs->dp_coeff = p->sys.scaling_coefficients[DP_ACC]; + scalingCoeffs->fp_mul_coeff = p->sys.scaling_coefficients[FP_MUL_ACC]; + scalingCoeffs->fp_div_coeff = p->sys.scaling_coefficients[FP_DIV_ACC]; + scalingCoeffs->dp_mul_coeff = p->sys.scaling_coefficients[DP_MUL_ACC]; + scalingCoeffs->dp_div_coeff = p->sys.scaling_coefficients[DP_DIV_ACC]; + scalingCoeffs->sqrt_coeff = p->sys.scaling_coefficients[FP_SQRT_ACC]; + scalingCoeffs->log_coeff = p->sys.scaling_coefficients[FP_LG_ACC]; + scalingCoeffs->sin_coeff = p->sys.scaling_coefficients[FP_SIN_ACC]; + scalingCoeffs->exp_coeff = p->sys.scaling_coefficients[FP_EXP_ACC]; + scalingCoeffs->tensor_coeff = p->sys.scaling_coefficients[TENSOR_ACC]; + scalingCoeffs->tex_coeff = p->sys.scaling_coefficients[TEX_ACC]; + return scalingCoeffs; + +} + +void gpgpu_sim_wrapper::set_int_accesses(double ialu_accesses, + double imul24_accesses, + double imul32_accesses, + double imul_accesses, + double idiv_accesses) +{ + + sample_perf_counters[INT_ACC]=ialu_accesses; + sample_perf_counters[INT_MUL24_ACC]=imul24_accesses; + sample_perf_counters[INT_MUL32_ACC]=imul32_accesses; + sample_perf_counters[INT_MUL_ACC]=imul_accesses; + sample_perf_counters[INT_DIV_ACC]=idiv_accesses; +} + +void gpgpu_sim_wrapper::set_dp_accesses(double dpu_accesses, + double dpmul_accesses, + double dpdiv_accesses) +{ + sample_perf_counters[DP_ACC]=dpu_accesses; + sample_perf_counters[DP_MUL_ACC]=dpmul_accesses; + sample_perf_counters[DP_DIV_ACC]=dpdiv_accesses; +} + +void gpgpu_sim_wrapper::set_fp_accesses(double fpu_accesses, + double fpmul_accesses, + double fpdiv_accesses) +{ + sample_perf_counters[FP_ACC]=fpu_accesses; + sample_perf_counters[FP_MUL_ACC]=fpmul_accesses; + sample_perf_counters[FP_DIV_ACC]=fpdiv_accesses; +} + +void gpgpu_sim_wrapper::set_trans_accesses(double sqrt_accesses, + double log_accesses, + double sin_accesses, + 
double exp_accesses) +{ + + sample_perf_counters[FP_SQRT_ACC]=sqrt_accesses; + sample_perf_counters[FP_LG_ACC]=log_accesses; + sample_perf_counters[FP_SIN_ACC]=sin_accesses; + sample_perf_counters[FP_EXP_ACC]=exp_accesses; + +} + +void gpgpu_sim_wrapper::set_tensor_accesses(double tensor_accesses) +{ + sample_perf_counters[TENSOR_ACC]=tensor_accesses; + +} + +void gpgpu_sim_wrapper::set_tex_accesses(double tex_accesses) +{ + sample_perf_counters[TEX_ACC]=tex_accesses; + +} + +void gpgpu_sim_wrapper::set_avg_active_threads(float active_threads) +{ + avg_threads_per_warp = (unsigned)ceil(active_threads); + avg_threads_per_warp_tot += active_threads; +} + +void gpgpu_sim_wrapper::set_active_lanes_power(double sp_avg_active_lane, + double sfu_avg_active_lane) { + p->sys.core[0].sp_average_active_lanes = sp_avg_active_lane; + p->sys.core[0].sfu_average_active_lanes = sfu_avg_active_lane; +} + +void gpgpu_sim_wrapper::set_NoC_power(double noc_tot_acc) { + p->sys.NoC[0].total_accesses = + noc_tot_acc * p->sys.scaling_coefficients[NOC_A]; + sample_perf_counters[NOC_A] = noc_tot_acc; +} + +void gpgpu_sim_wrapper::power_metrics_calculations() { + total_sample_count++; + kernel_sample_count++; + + // Current sample power + double sample_power = proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONSTP] + sample_cmp_pwr[STATICP]; + // double sample_power; + // for(unsigned i=0; i kernel_power.max) { + kernel_power.max = sample_power; + for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { + kernel_cmp_pwr[ind].max = (double)sample_cmp_pwr[ind]; + } + for (unsigned ind = 0; ind < num_perf_counters; ++ind) { + kernel_cmp_perf_counters[ind].max = sample_perf_counters[ind]; + } + } + + // Min Power + if (sample_power < kernel_power.min || (kernel_power.min == 0)) { + kernel_power.min = sample_power; + for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { + kernel_cmp_pwr[ind].min = (double)sample_cmp_pwr[ind]; + } + for (unsigned ind = 0; ind < num_perf_counters; ++ind) { + 
kernel_cmp_perf_counters[ind].min = sample_perf_counters[ind]; + } + } + + gpu_tot_power.avg = (gpu_tot_power.avg + sample_power); + gpu_tot_power.max = + (sample_power > gpu_tot_power.max) ? sample_power : gpu_tot_power.max; + gpu_tot_power.min = + ((sample_power < gpu_tot_power.min) || (gpu_tot_power.min == 0)) + ? sample_power + : gpu_tot_power.min; +} + +void gpgpu_sim_wrapper::print_trace_files() { + open_files(); + + for (unsigned i = 0; i < num_perf_counters; ++i) { + gzprintf(metric_trace_file, "%f,", sample_perf_counters[i]); + } + gzprintf(metric_trace_file, "\n"); + + gzprintf(power_trace_file, "%f,", proc_power); + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + gzprintf(power_trace_file, "%f,", sample_cmp_pwr[i]); + } + gzprintf(power_trace_file, "\n"); + + close_files(); +} + +void gpgpu_sim_wrapper::update_coefficients() +{ + + initpower_coeff[FP_INT]=proc->cores[0]->get_coefficient_fpint_insts(); + effpower_coeff[FP_INT]=initpower_coeff[FP_INT] * p->sys.scaling_coefficients[FP_INT]; + + initpower_coeff[TOT_INST]=proc->cores[0]->get_coefficient_tot_insts(); + effpower_coeff[TOT_INST]=initpower_coeff[TOT_INST] * p->sys.scaling_coefficients[TOT_INST]; + + initpower_coeff[REG_RD]=proc->cores[0]->get_coefficient_regreads_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + initpower_coeff[REG_WR]=proc->cores[0]->get_coefficient_regwrites_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + initpower_coeff[NON_REG_OPs]=proc->cores[0]->get_coefficient_noregfileops_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + effpower_coeff[REG_RD]=initpower_coeff[REG_RD]*p->sys.scaling_coefficients[REG_RD]; + effpower_coeff[REG_WR]=initpower_coeff[REG_WR]*p->sys.scaling_coefficients[REG_WR]; + effpower_coeff[NON_REG_OPs]=initpower_coeff[NON_REG_OPs]*p->sys.scaling_coefficients[NON_REG_OPs]; + + initpower_coeff[IC_H]=proc->cores[0]->get_coefficient_icache_hits(); + 
initpower_coeff[IC_M]=proc->cores[0]->get_coefficient_icache_misses(); + effpower_coeff[IC_H]=initpower_coeff[IC_H]*p->sys.scaling_coefficients[IC_H]; + effpower_coeff[IC_M]=initpower_coeff[IC_M]*p->sys.scaling_coefficients[IC_M]; + + initpower_coeff[CC_H]=(proc->cores[0]->get_coefficient_ccache_readhits()+proc->get_coefficient_readcoalescing()); + initpower_coeff[CC_M]=(proc->cores[0]->get_coefficient_ccache_readmisses()+proc->get_coefficient_readcoalescing()); + effpower_coeff[CC_H]=initpower_coeff[CC_H]*p->sys.scaling_coefficients[CC_H]; + effpower_coeff[CC_M]=initpower_coeff[CC_M]*p->sys.scaling_coefficients[CC_M]; + + initpower_coeff[TC_H]=(proc->cores[0]->get_coefficient_tcache_readhits()+proc->get_coefficient_readcoalescing()); + initpower_coeff[TC_M]=(proc->cores[0]->get_coefficient_tcache_readmisses()+proc->get_coefficient_readcoalescing()); + effpower_coeff[TC_H]=initpower_coeff[TC_H]*p->sys.scaling_coefficients[TC_H]; + effpower_coeff[TC_M]=initpower_coeff[TC_M]*p->sys.scaling_coefficients[TC_M]; + + initpower_coeff[SHRD_ACC]=proc->cores[0]->get_coefficient_sharedmemory_readhits(); + effpower_coeff[SHRD_ACC]=initpower_coeff[SHRD_ACC]*p->sys.scaling_coefficients[SHRD_ACC]; + + initpower_coeff[DC_RH]=(proc->cores[0]->get_coefficient_dcache_readhits() + proc->get_coefficient_readcoalescing()); + initpower_coeff[DC_RM]=(proc->cores[0]->get_coefficient_dcache_readmisses() + proc->get_coefficient_readcoalescing()); + initpower_coeff[DC_WH]=(proc->cores[0]->get_coefficient_dcache_writehits() + proc->get_coefficient_writecoalescing()); + initpower_coeff[DC_WM]=(proc->cores[0]->get_coefficient_dcache_writemisses() + proc->get_coefficient_writecoalescing()); + effpower_coeff[DC_RH]=initpower_coeff[DC_RH]*p->sys.scaling_coefficients[DC_RH]; + effpower_coeff[DC_RM]=initpower_coeff[DC_RM]*p->sys.scaling_coefficients[DC_RM]; + effpower_coeff[DC_WH]=initpower_coeff[DC_WH]*p->sys.scaling_coefficients[DC_WH]; + 
effpower_coeff[DC_WM]=initpower_coeff[DC_WM]*p->sys.scaling_coefficients[DC_WM]; + + initpower_coeff[L2_RH]=proc->get_coefficient_l2_read_hits(); + initpower_coeff[L2_RM]=proc->get_coefficient_l2_read_misses(); + initpower_coeff[L2_WH]=proc->get_coefficient_l2_write_hits(); + initpower_coeff[L2_WM]=proc->get_coefficient_l2_write_misses(); + effpower_coeff[L2_RH]=initpower_coeff[L2_RH]*p->sys.scaling_coefficients[L2_RH]; + effpower_coeff[L2_RM]=initpower_coeff[L2_RM]*p->sys.scaling_coefficients[L2_RM]; + effpower_coeff[L2_WH]=initpower_coeff[L2_WH]*p->sys.scaling_coefficients[L2_WH]; + effpower_coeff[L2_WM]=initpower_coeff[L2_WM]*p->sys.scaling_coefficients[L2_WM]; + + initpower_coeff[IDLE_CORE_N]=p->sys.idle_core_power * proc->cores[0]->executionTime; + effpower_coeff[IDLE_CORE_N]=initpower_coeff[IDLE_CORE_N]*p->sys.scaling_coefficients[IDLE_CORE_N]; + + initpower_coeff[PIPE_A]=proc->cores[0]->get_coefficient_duty_cycle(); + effpower_coeff[PIPE_A]=initpower_coeff[PIPE_A]*p->sys.scaling_coefficients[PIPE_A]; + + initpower_coeff[MEM_RD]=proc->get_coefficient_mem_reads(); + initpower_coeff[MEM_WR]=proc->get_coefficient_mem_writes(); + initpower_coeff[MEM_PRE]=proc->get_coefficient_mem_pre(); + effpower_coeff[MEM_RD]=initpower_coeff[MEM_RD]*p->sys.scaling_coefficients[MEM_RD]; + effpower_coeff[MEM_WR]=initpower_coeff[MEM_WR]*p->sys.scaling_coefficients[MEM_WR]; + effpower_coeff[MEM_PRE]=initpower_coeff[MEM_PRE]*p->sys.scaling_coefficients[MEM_PRE]; + + double fp_coeff = proc->cores[0]->get_coefficient_fpu_accesses(); + double sfu_coeff = proc->cores[0]->get_coefficient_sfu_accesses(); + + initpower_coeff[INT_ACC]= proc->cores[0]->get_coefficient_ialu_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + + if(tot_fpu_accesses != 0){ + initpower_coeff[FP_ACC]= fp_coeff * sample_perf_counters[FP_ACC]/tot_fpu_accesses; + initpower_coeff[DP_ACC]= fp_coeff * sample_perf_counters[DP_ACC]/tot_fpu_accesses; + } + else{ + initpower_coeff[FP_ACC]= 0; 
+ initpower_coeff[DP_ACC]= 0; + } + + if(tot_sfu_accesses != 0){ + initpower_coeff[INT_MUL24_ACC]= sfu_coeff * sample_perf_counters[INT_MUL24_ACC]/tot_sfu_accesses; + initpower_coeff[INT_MUL32_ACC]= sfu_coeff * sample_perf_counters[INT_MUL32_ACC]/tot_sfu_accesses; + initpower_coeff[INT_MUL_ACC]= sfu_coeff * sample_perf_counters[INT_MUL_ACC]/tot_sfu_accesses; + initpower_coeff[INT_DIV_ACC]= sfu_coeff * sample_perf_counters[INT_DIV_ACC]/tot_sfu_accesses; + initpower_coeff[DP_MUL_ACC]= sfu_coeff * sample_perf_counters[DP_MUL_ACC]/tot_sfu_accesses; + initpower_coeff[DP_DIV_ACC]= sfu_coeff * sample_perf_counters[DP_DIV_ACC]/tot_sfu_accesses; + initpower_coeff[FP_MUL_ACC]= sfu_coeff * sample_perf_counters[FP_MUL_ACC]/tot_sfu_accesses; + initpower_coeff[FP_DIV_ACC]= sfu_coeff * sample_perf_counters[FP_DIV_ACC]/tot_sfu_accesses; + initpower_coeff[FP_SQRT_ACC]= sfu_coeff * sample_perf_counters[FP_SQRT_ACC]/tot_sfu_accesses; + initpower_coeff[FP_LG_ACC]= sfu_coeff * sample_perf_counters[FP_LG_ACC]/tot_sfu_accesses; + initpower_coeff[FP_SIN_ACC]= sfu_coeff * sample_perf_counters[FP_SIN_ACC]/tot_sfu_accesses; + initpower_coeff[FP_EXP_ACC]= sfu_coeff * sample_perf_counters[FP_EXP_ACC]/tot_sfu_accesses; + initpower_coeff[TENSOR_ACC]= sfu_coeff * sample_perf_counters[TENSOR_ACC]/tot_sfu_accesses; + initpower_coeff[TEX_ACC]= sfu_coeff * sample_perf_counters[TEX_ACC]/tot_sfu_accesses; + } + else{ + initpower_coeff[INT_MUL24_ACC]= 0; + initpower_coeff[INT_MUL32_ACC]= 0; + initpower_coeff[INT_MUL_ACC]= 0; + initpower_coeff[INT_DIV_ACC]= 0; + initpower_coeff[DP_MUL_ACC]= 0; + initpower_coeff[DP_DIV_ACC]= 0; + initpower_coeff[FP_MUL_ACC]= 0; + initpower_coeff[FP_DIV_ACC]= 0; + initpower_coeff[FP_SQRT_ACC]= 0; + initpower_coeff[FP_LG_ACC]= 0; + initpower_coeff[FP_SIN_ACC]= 0; + initpower_coeff[FP_EXP_ACC]= 0; + initpower_coeff[TENSOR_ACC]= 0; + initpower_coeff[TEX_ACC]= 0; + } + + effpower_coeff[INT_ACC]= initpower_coeff[INT_ACC]; + effpower_coeff[FP_ACC]= initpower_coeff[FP_ACC]; + 
effpower_coeff[DP_ACC]= initpower_coeff[DP_ACC]; + effpower_coeff[INT_MUL24_ACC]= initpower_coeff[INT_MUL24_ACC]; + effpower_coeff[INT_MUL32_ACC]= initpower_coeff[INT_MUL32_ACC]; + effpower_coeff[INT_MUL_ACC]= initpower_coeff[INT_MUL_ACC]; + effpower_coeff[INT_DIV_ACC]= initpower_coeff[INT_DIV_ACC]; + effpower_coeff[DP_MUL_ACC]= initpower_coeff[DP_MUL_ACC]; + effpower_coeff[DP_DIV_ACC]= initpower_coeff[DP_DIV_ACC]; + effpower_coeff[FP_MUL_ACC]= initpower_coeff[FP_MUL_ACC]; + effpower_coeff[FP_DIV_ACC]= initpower_coeff[FP_DIV_ACC]; + effpower_coeff[FP_SQRT_ACC]= initpower_coeff[FP_SQRT_ACC]; + effpower_coeff[FP_LG_ACC]= initpower_coeff[FP_LG_ACC]; + effpower_coeff[FP_SIN_ACC]= initpower_coeff[FP_SIN_ACC]; + effpower_coeff[FP_EXP_ACC]= initpower_coeff[FP_EXP_ACC]; + effpower_coeff[TENSOR_ACC]= initpower_coeff[TENSOR_ACC]; + effpower_coeff[TEX_ACC]= initpower_coeff[TEX_ACC]; + + initpower_coeff[NOC_A]=proc->get_coefficient_noc_accesses(); + effpower_coeff[NOC_A]=initpower_coeff[NOC_A]*p->sys.scaling_coefficients[NOC_A]; + + //const_dynamic_power=proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); + + for(unsigned i=0; icores[0]->executionTime); + effpower_coeff[i]/=(proc->cores[0]->executionTime); + } +} + +double gpgpu_sim_wrapper::calculate_static_power(){ + double int_accesses = initpower_coeff[INT_ACC] + initpower_coeff[INT_MUL24_ACC] + initpower_coeff[INT_MUL32_ACC] + initpower_coeff[INT_MUL_ACC] + initpower_coeff[INT_DIV_ACC]; + double int_add_accesses = initpower_coeff[INT_ACC]; + double int_mul_accesses = initpower_coeff[INT_MUL24_ACC] + initpower_coeff[INT_MUL32_ACC] + initpower_coeff[INT_MUL_ACC] + initpower_coeff[INT_DIV_ACC]; + double fp_accesses = initpower_coeff[FP_ACC] + initpower_coeff[FP_MUL_ACC] + initpower_coeff[FP_DIV_ACC]; + double dp_accesses = initpower_coeff[DP_ACC] + initpower_coeff[DP_MUL_ACC] + initpower_coeff[DP_DIV_ACC]; + double sfu_accesses = initpower_coeff[FP_SQRT_ACC] + initpower_coeff[FP_LG_ACC] + 
initpower_coeff[FP_SIN_ACC] + initpower_coeff[FP_EXP_ACC]; + double tensor_accesses = initpower_coeff[TENSOR_ACC]; + double tex_accesses = initpower_coeff[TEX_ACC]; + double total_static_power = 0.0; + double base_static_power = 0.0; + double lane_static_power = 0.0; + double per_active_core = (num_cores - num_idle_cores)/num_cores; + + + double l1_accesses = initpower_coeff[DC_RH] + initpower_coeff[DC_RM] + initpower_coeff[DC_WH] + initpower_coeff[DC_WM]; + double l2_accesses = initpower_coeff[L2_RH] + initpower_coeff[L2_RM] + initpower_coeff[L2_WH] + initpower_coeff[L2_WM]; + double shared_accesses = initpower_coeff[SHRD_ACC]; + + + if(avg_threads_per_warp == 0){ //no functional unit threads, check for memory or a 'LIGHT_SM' + if(l1_accesses != 0.0) + return (p->sys.static_l1_flane*per_active_core); + else if(shared_accesses != 0.0) + return (p->sys.static_shared_flane*per_active_core); + else if(l2_accesses != 0.0) + return (p->sys.static_l2_flane*per_active_core); + else //LIGHT_SM + return (p->sys.static_light_flane*per_active_core); //return LIGHT_SM base static power + } + + /* using a linear model for thread divergence */ + if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses != 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT_FP_DP */ + base_static_power = p->sys.static_cat3_flane; + lane_static_power = p->sys.static_cat3_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses != 0.0) && (tex_accesses == 0.0)){ + /* INT_FP_TENSOR */ + base_static_power = p->sys.static_cat6_flane; + lane_static_power = p->sys.static_cat6_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses != 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT_FP_SFU */ + base_static_power = p->sys.static_cat4_flane; + lane_static_power = p->sys.static_cat4_addlane; + } + + 
else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses != 0.0)){ + /* INT_FP_TEX */ + base_static_power = p->sys.static_cat5_flane; + lane_static_power = p->sys.static_cat5_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT_FP */ + base_static_power = p->sys.static_cat2_flane; + lane_static_power = p->sys.static_cat2_addlane; + } + + else if((int_accesses != 0.0) && (fp_accesses == 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* INT */ + /* Seperating INT_ADD only and INT_MUL only from mix of INT instructions */ + if((int_add_accesses != 0.0) && (int_mul_accesses == 0.0)){ //INT_ADD + base_static_power = p->sys.static_intadd_flane; + lane_static_power = p->sys.static_intadd_addlane; + } + else if((int_add_accesses == 0.0) && (int_mul_accesses != 0.0)){ //INT_MUL + base_static_power = p->sys.static_intmul_flane; + lane_static_power = p->sys.static_intmul_addlane; + } + else{ //INT_ADD+MUL + base_static_power = p->sys.static_cat1_flane; + lane_static_power = p->sys.static_cat1_addlane; + } + } + + else if((int_accesses == 0.0) && (fp_accesses == 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ + /* LIGHT_SM or memory only sample */ + lane_static_power = 0.0; //addlane static power is 0 for l1/l2/shared memory only accesses + if(l1_accesses != 0.0) + base_static_power = p->sys.static_l1_flane; + else if(shared_accesses != 0.0) + base_static_power = p->sys.static_shared_flane; + else if(l2_accesses != 0.0) + base_static_power = p->sys.static_l2_flane; + else{ + base_static_power = p->sys.static_light_flane; + lane_static_power = p->sys.static_light_addlane; + } + } + else{ + base_static_power = p->sys.static_geomean_flane; 
//GEOMEAN except LIGHT_SM if we don't fall into any of the categories above + lane_static_power = p->sys.static_geomean_addlane; + } + + total_static_power = base_static_power + (((double)avg_threads_per_warp-1.0)*lane_static_power); //Linear Model + return (total_static_power*per_active_core); +} + +void gpgpu_sim_wrapper::update_components_power() +{ + + update_coefficients(); + + proc_power=proc->rt_power.readOp.dynamic; + sample_cmp_pwr[IBP]=(proc->cores[0]->ifu->IB->rt_power.readOp.dynamic + +proc->cores[0]->ifu->IB->rt_power.writeOp.dynamic + +proc->cores[0]->ifu->ID_misc->rt_power.readOp.dynamic + +proc->cores[0]->ifu->ID_operand->rt_power.readOp.dynamic + +proc->cores[0]->ifu->ID_inst->rt_power.readOp.dynamic)/(proc->cores[0]->executionTime); + + sample_cmp_pwr[ICP]=proc->cores[0]->ifu->icache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[DCP]=proc->cores[0]->lsu->dcache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[TCP]=proc->cores[0]->lsu->tcache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[CCP]=proc->cores[0]->lsu->ccache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[SHRDP]=proc->cores[0]->lsu->sharedmemory.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[RFP]=(proc->cores[0]->exu->rfu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)) + *(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + + double sample_fp_pwr = (proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)); + + double sample_sfu_pwr = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)); + + sample_cmp_pwr[INTP]=(proc->cores[0]->exu->exeu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)) + *(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + + + if(tot_fpu_accesses != 0){ + sample_cmp_pwr[FPUP]= sample_fp_pwr * 
sample_perf_counters[FP_ACC]/tot_fpu_accesses; + sample_cmp_pwr[DPUP]= sample_fp_pwr * sample_perf_counters[DP_ACC]/tot_fpu_accesses; + } + else{ + sample_cmp_pwr[FPUP]= 0; + sample_cmp_pwr[DPUP]= 0; + } + if(tot_sfu_accesses != 0){ + sample_cmp_pwr[INT_MUL24P]= sample_sfu_pwr * sample_perf_counters[INT_MUL24_ACC]/tot_sfu_accesses; + sample_cmp_pwr[INT_MUL32P]= sample_sfu_pwr * sample_perf_counters[INT_MUL32_ACC]/tot_sfu_accesses; + sample_cmp_pwr[INT_MULP]= sample_sfu_pwr * sample_perf_counters[INT_MUL_ACC]/tot_sfu_accesses; + sample_cmp_pwr[INT_DIVP]= sample_sfu_pwr * sample_perf_counters[INT_DIV_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_MULP]= sample_sfu_pwr * sample_perf_counters[FP_MUL_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_DIVP]= sample_sfu_pwr * sample_perf_counters[FP_DIV_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_SQRTP]= sample_sfu_pwr * sample_perf_counters[FP_SQRT_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_LGP]= sample_sfu_pwr * sample_perf_counters[FP_LG_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_SINP]= sample_sfu_pwr * sample_perf_counters[FP_SIN_ACC]/tot_sfu_accesses; + sample_cmp_pwr[FP_EXP]= sample_sfu_pwr * sample_perf_counters[FP_EXP_ACC]/tot_sfu_accesses; + sample_cmp_pwr[DP_MULP]= sample_sfu_pwr * sample_perf_counters[DP_MUL_ACC]/tot_sfu_accesses; + sample_cmp_pwr[DP_DIVP]= sample_sfu_pwr * sample_perf_counters[DP_DIV_ACC]/tot_sfu_accesses; + sample_cmp_pwr[TENSORP]= sample_sfu_pwr * sample_perf_counters[TENSOR_ACC]/tot_sfu_accesses; + sample_cmp_pwr[TEXP]= sample_sfu_pwr * sample_perf_counters[TEX_ACC]/tot_sfu_accesses; + } + else{ + sample_cmp_pwr[INT_MUL24P]= 0; + sample_cmp_pwr[INT_MUL32P]= 0; + sample_cmp_pwr[INT_MULP]= 0; + sample_cmp_pwr[INT_DIVP]= 0; + sample_cmp_pwr[FP_MULP]= 0; + sample_cmp_pwr[FP_DIVP]= 0; + sample_cmp_pwr[FP_SQRTP]= 0; + sample_cmp_pwr[FP_LGP]= 0; + sample_cmp_pwr[FP_SINP]= 0; + sample_cmp_pwr[FP_EXP]= 0; + sample_cmp_pwr[DP_MULP]= 0; + sample_cmp_pwr[DP_DIVP]= 0; + sample_cmp_pwr[TENSORP]= 0; + 
sample_cmp_pwr[TEXP]= 0; + } + + sample_cmp_pwr[SCHEDP]=proc->cores[0]->exu->scheu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[L2CP]=(proc->XML->sys.number_of_L2s>0)? proc->l2array[0]->rt_power.readOp.dynamic/(proc->cores[0]->executionTime):0; + + sample_cmp_pwr[MCP]=(proc->mc->rt_power.readOp.dynamic-proc->mc->dram->rt_power.readOp.dynamic)/(proc->cores[0]->executionTime); + + sample_cmp_pwr[NOCP]=proc->nocs[0]->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[DRAMP]=proc->mc->dram->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + + sample_cmp_pwr[PIPEP]=proc->cores[0]->Pipeline_energy/(proc->cores[0]->executionTime); + + sample_cmp_pwr[IDLE_COREP]=proc->cores[0]->IdleCoreEnergy/(proc->cores[0]->executionTime); + + // This constant dynamic power (e.g., clock power) part is estimated via regression model. + sample_cmp_pwr[CONSTP]=0; + sample_cmp_pwr[STATICP]=0; + // double cnst_dyn = proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); + // // If the regression scaling term is greater than the recorded constant dynamic power + // // then use the difference (other portion already added to dynamic power). Else, + // // all the constant dynamic power is accounted for, add nothing. 
+ // if(p->sys.scaling_coefficients[constant_power] > cnst_dyn) + // sample_cmp_pwr[CONSTP] = (p->sys.scaling_coefficients[constant_power]-cnst_dyn); + sample_cmp_pwr[CONSTP] = p->sys.scaling_coefficients[constant_power]; + sample_cmp_pwr[STATICP] = calculate_static_power(); + + if(g_dvfs_enabled){ + double voltage_ratio = modeled_chip_voltage/p->sys.modeled_chip_voltage_ref; + sample_cmp_pwr[IDLE_COREP] *= voltage_ratio; // static power scaled by voltage_ratio + sample_cmp_pwr[STATICP] *= voltage_ratio; // static power scaled by voltage_ratio + for(unsigned i=0; icompute(); } +void gpgpu_sim_wrapper::print_power_kernel_stats( + double gpu_sim_cycle, double gpu_tot_sim_cycle, double init_value, + const std::string& kernel_info_string, bool print_trace) { + detect_print_steady_state(1, init_value); + if (g_power_simulation_enabled) { + powerfile << kernel_info_string << std::endl; + + sanity_check((kernel_power.avg * kernel_sample_count), kernel_tot_power); + powerfile << "Kernel Average Power Data:" << std::endl; + powerfile << "kernel_avg_power = " << kernel_power.avg << std::endl; + + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + powerfile << "gpu_avg_" << pwr_cmp_label[i] << " = " + << kernel_cmp_pwr[i].avg / kernel_sample_count << std::endl; + } + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_avg_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].avg / kernel_sample_count + << std::endl; + } + + powerfile << "gpu_avg_threads_per_warp = " + << avg_threads_per_warp_tot / (double)kernel_sample_count + << std::endl; + + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_tot_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].avg + << std::endl; + } + + powerfile << std::endl << "Kernel Maximum Power Data:" << std::endl; + powerfile << "kernel_max_power = " << kernel_power.max << std::endl; + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + powerfile << "gpu_max_" << pwr_cmp_label[i] << " = 
" + << kernel_cmp_pwr[i].max << std::endl; + } + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_max_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].max << std::endl; + } + + powerfile << std::endl << "Kernel Minimum Power Data:" << std::endl; + powerfile << "kernel_min_power = " << kernel_power.min << std::endl; + for (unsigned i = 0; i < num_pwr_cmps; ++i) { + powerfile << "gpu_min_" << pwr_cmp_label[i] << " = " + << kernel_cmp_pwr[i].min << std::endl; + } + for (unsigned i = 0; i < num_perf_counters; ++i) { + powerfile << "gpu_min_" << perf_count_label[i] << " = " + << kernel_cmp_perf_counters[i].min << std::endl; + } + + powerfile << std::endl + << "Accumulative Power Statistics Over Previous Kernels:" + << std::endl; + powerfile << "gpu_tot_avg_power = " + << gpu_tot_power.avg / total_sample_count << std::endl; + powerfile << "gpu_tot_max_power = " << gpu_tot_power.max << std::endl; + powerfile << "gpu_tot_min_power = " << gpu_tot_power.min << std::endl; + powerfile << std::endl << std::endl; + powerfile.flush(); + + if (print_trace) { + print_trace_files(); + } + } +} +void gpgpu_sim_wrapper::dump() { + if (g_power_per_cycle_dump) proc->displayEnergy(2, 5); +} + +void gpgpu_sim_wrapper::print_steady_state(int position, double init_val) { + double temp_avg = sample_val / (double)samples.size(); + double temp_ipc = (init_val - init_inst_val) / + (double)(samples.size() * gpu_stat_sample_freq); + + if ((samples.size() > + gpu_steady_min_period)) { // If steady state occurred for some time, + // print to file + has_written_avg = true; + gzprintf(steady_state_tacking_file, "%u,%d,%f,%f,", sample_start, + total_sample_count, temp_avg, temp_ipc); + for (unsigned i = 0; i < num_perf_counters; ++i) { + gzprintf(steady_state_tacking_file, "%f,", + samples_counter.at(i) / ((double)samples.size())); + } + gzprintf(steady_state_tacking_file, "\n"); + } else { + if (!has_written_avg && position) + 
gzprintf(steady_state_tacking_file, + "ERROR! Not enough steady state points to generate average\n"); + } + + sample_start = 0; + sample_val = 0; + init_inst_val = init_val; + samples.clear(); + samples_counter.clear(); + pwr_counter.clear(); + assert(samples.size() == 0); +} + +void gpgpu_sim_wrapper::detect_print_steady_state(int position, + double init_val) { + // Calculating Average + if (g_power_simulation_enabled && g_steady_power_levels_enabled) { + steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "a"); + if (position == 0) { + if (samples.size() == 0) { + // First sample + sample_start = total_sample_count; + sample_val = proc->rt_power.readOp.dynamic; + init_inst_val = init_val; + samples.push_back(proc->rt_power.readOp.dynamic); + assert(samples_counter.size() == 0); + assert(pwr_counter.size() == 0); + + for (unsigned i = 0; i < (num_perf_counters); ++i) { + samples_counter.push_back(sample_perf_counters[i]); + } + + for (unsigned i = 0; i < (num_pwr_cmps); ++i) { + pwr_counter.push_back(sample_cmp_pwr[i]); + } + assert(pwr_counter.size() == (double)num_pwr_cmps); + assert(samples_counter.size() == (double)num_perf_counters); + } else { + // Get current average + double temp_avg = sample_val / (double)samples.size(); + + if (abs(proc->rt_power.readOp.dynamic - temp_avg) < + gpu_steady_power_deviation) { // Value is within threshold + sample_val += proc->rt_power.readOp.dynamic; + samples.push_back(proc->rt_power.readOp.dynamic); + for (unsigned i = 0; i < (num_perf_counters); ++i) { + samples_counter.at(i) += sample_perf_counters[i]; + } + + for (unsigned i = 0; i < (num_pwr_cmps); ++i) { + pwr_counter.at(i) += sample_cmp_pwr[i]; + } + + } else { // Value exceeds threshold, not considered steady state + print_steady_state(position, init_val); + } + } + } else { + print_steady_state(position, init_val); + } + gzclose(steady_state_tacking_file); + } +} + +void gpgpu_sim_wrapper::open_files() { + if (g_power_simulation_enabled) { + if 
(g_power_trace_enabled) { + power_trace_file = gzopen(g_power_trace_filename, "a"); + metric_trace_file = gzopen(g_metric_trace_filename, "a"); + } + } +} +void gpgpu_sim_wrapper::close_files() { + if (g_power_simulation_enabled) { + if (g_power_trace_enabled) { + gzclose(power_trace_file); + gzclose(metric_trace_file); + } + } +} diff --git a/src/gpuwattch/gpgpu_sim_wrapper.h b/src/accelwattch/gpgpu_sim_wrapper.h similarity index 68% rename from src/gpuwattch/gpgpu_sim_wrapper.h rename to src/accelwattch/gpgpu_sim_wrapper.h index 00e4f0746..33c4b72f2 100644 --- a/src/gpuwattch/gpgpu_sim_wrapper.h +++ b/src/accelwattch/gpgpu_sim_wrapper.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. 
Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -54,9 +55,34 @@ struct avg_max_min_counters { } }; +#ifndef COEFF_STRUCT +#define COEFF_STRUCT + +struct PowerscalingCoefficients{ + double int_coeff; + double int_mul_coeff; + double int_mul24_coeff; + double int_mul32_coeff; + double int_div_coeff; + double fp_coeff; + double dp_coeff; + double fp_mul_coeff; + double fp_div_coeff; + double dp_mul_coeff; + double dp_div_coeff; + double sqrt_coeff; + double log_coeff; + double sin_coeff; + double exp_coeff; + double tensor_coeff; + double tex_coeff; +}; + +#endif + class gpgpu_sim_wrapper { public: - gpgpu_sim_wrapper(bool power_simulation_enabled, char* xmlfile); + gpgpu_sim_wrapper(bool power_simulation_enabled, char* xmlfile, int power_simulation_mode, bool dvfs_enabled); ~gpgpu_sim_wrapper(); void init_mcpat(char* xmlfile, char* powerfile, char* power_trace_file, @@ -64,7 +90,9 @@ class gpgpu_sim_wrapper { bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, bool power_per_cycle_dump, double steady_power_deviation, double steady_min_period, - int zlevel, double init_val, int stat_sample_freq); + int zlevel, double init_val, int stat_sample_freq, int power_sim_mode, + bool dvfs_enabled, unsigned clock_freq, unsigned num_shaders); + void init_mcpat_hw_mode(unsigned gpu_sim_cycle); void detect_print_steady_state(int position, double init_val); void close_files(); void open_files(); @@ 
-72,6 +100,7 @@ class gpgpu_sim_wrapper { void dump(); void print_trace_files(); void update_components_power(); + double calculate_static_power(); void update_coefficients(); void reset_counters(); void print_power_kernel_stats(double gpu_sim_cycle, double gpu_tot_sim_cycle, @@ -79,6 +108,7 @@ class gpgpu_sim_wrapper { const std::string& kernel_info_string, bool print_trace); void power_metrics_calculations(); + void set_model_voltage(double model_voltage); void set_inst_power(bool clk_gated_lanes, double tot_cycles, double busy_cycles, double tot_inst, double int_inst, double fp_inst, double load_inst, double store_inst, @@ -92,16 +122,31 @@ class gpgpu_sim_wrapper { double write_accesses, double write_misses); void set_l2cache_power(double read_accesses, double read_misses, double write_accesses, double write_misses); + void set_num_cores(double num_core); void set_idle_core_power(double num_idle_core); void set_duty_cycle_power(double duty_cycle); void set_mem_ctrl_power(double reads, double writes, double dram_precharge); void set_exec_unit_power(double fpu_accesses, double ialu_accesses, double sfu_accesses); + void set_int_accesses(double ialu_accesses, double imul24_accesses, + double imul32_accesses, double imul_accesses, + double idiv_accesses); + void set_dp_accesses(double dpu_accesses, double dpmul_accesses, + double dpdiv_accesses); + void set_fp_accesses(double fpu_accesses, double fpmul_accesses, + double fpdiv_accesses); + void set_trans_accesses(double sqrt_accesses, double log_accesses, + double sin_accesses, double exp_accesses); + void set_tensor_accesses(double tensor_accesses); + void set_tex_accesses(double tex_accesses); + void set_avg_active_threads(float active_threads); void set_active_lanes_power(double sp_avg_active_lane, double sfu_avg_active_lane); - void set_NoC_power(double noc_tot_reads, double noc_tot_write); + void set_NoC_power(double noc_tot_acc); bool sanity_check(double a, double b); + PowerscalingCoefficients * 
get_scaling_coeffs(); + private: void print_steady_state(int position, double init_val); @@ -109,8 +154,10 @@ class gpgpu_sim_wrapper { ParseXML* p; // power parameters double const_dynamic_power; + double avg_threads_per_warp_tot; double proc_power; - + double num_cores; + double num_idle_cores; unsigned num_perf_counters; // # of performance counters unsigned num_pwr_cmps; // # of components modelled int kernel_sample_count; // # of samples per kernel @@ -140,6 +187,10 @@ class gpgpu_sim_wrapper { unsigned sample_start; double sample_val; double init_inst_val; + double tot_sfu_accesses; + double tot_fpu_accesses; + double modeled_chip_voltage; + unsigned avg_threads_per_warp; std::vector samples; std::vector samples_counter; std::vector pwr_counter; @@ -150,6 +201,8 @@ class gpgpu_sim_wrapper { char* g_metric_trace_filename; char* g_steady_state_tracking_filename; bool g_power_simulation_enabled; + int g_power_simulation_mode; + bool g_dvfs_enabled; bool g_steady_power_levels_enabled; bool g_power_trace_enabled; bool g_power_per_cycle_dump; diff --git a/src/gpuwattch/gpgpu_static.xml b/src/accelwattch/gpgpu_static.xml similarity index 100% rename from src/gpuwattch/gpgpu_static.xml rename to src/accelwattch/gpgpu_static.xml diff --git a/src/gpuwattch/interconnect.cc b/src/accelwattch/interconnect.cc similarity index 100% rename from src/gpuwattch/interconnect.cc rename to src/accelwattch/interconnect.cc diff --git a/src/gpuwattch/interconnect.h b/src/accelwattch/interconnect.h similarity index 100% rename from src/gpuwattch/interconnect.h rename to src/accelwattch/interconnect.h diff --git a/src/gpuwattch/iocontrollers.cc b/src/accelwattch/iocontrollers.cc similarity index 100% rename from src/gpuwattch/iocontrollers.cc rename to src/accelwattch/iocontrollers.cc diff --git a/src/gpuwattch/iocontrollers.h b/src/accelwattch/iocontrollers.h similarity index 100% rename from src/gpuwattch/iocontrollers.h rename to src/accelwattch/iocontrollers.h diff --git 
a/src/gpuwattch/logic.cc b/src/accelwattch/logic.cc similarity index 100% rename from src/gpuwattch/logic.cc rename to src/accelwattch/logic.cc diff --git a/src/gpuwattch/logic.h b/src/accelwattch/logic.h similarity index 100% rename from src/gpuwattch/logic.h rename to src/accelwattch/logic.h diff --git a/src/gpuwattch/main.cc b/src/accelwattch/main.cc similarity index 100% rename from src/gpuwattch/main.cc rename to src/accelwattch/main.cc diff --git a/src/gpuwattch/makefile b/src/accelwattch/makefile similarity index 100% rename from src/gpuwattch/makefile rename to src/accelwattch/makefile diff --git a/src/gpuwattch/mcpat.mk b/src/accelwattch/mcpat.mk similarity index 97% rename from src/gpuwattch/mcpat.mk rename to src/accelwattch/mcpat.mk index a09c23b4c..ad2d6c299 100644 --- a/src/gpuwattch/mcpat.mk +++ b/src/accelwattch/mcpat.mk @@ -1,5 +1,5 @@ -OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/gpuwattch +OUTPUT_DIR=$(SIM_OBJ_FILES_DIR)/accelwattch TARGET = mcpat SHELL = /bin/sh .PHONY: all depend clean diff --git a/src/gpuwattch/mcpatXeonCore.mk b/src/accelwattch/mcpatXeonCore.mk similarity index 100% rename from src/gpuwattch/mcpatXeonCore.mk rename to src/accelwattch/mcpatXeonCore.mk diff --git a/src/gpuwattch/memoryctrl.cc b/src/accelwattch/memoryctrl.cc similarity index 100% rename from src/gpuwattch/memoryctrl.cc rename to src/accelwattch/memoryctrl.cc diff --git a/src/gpuwattch/memoryctrl.h b/src/accelwattch/memoryctrl.h similarity index 100% rename from src/gpuwattch/memoryctrl.h rename to src/accelwattch/memoryctrl.h diff --git a/src/gpuwattch/noc.cc b/src/accelwattch/noc.cc similarity index 100% rename from src/gpuwattch/noc.cc rename to src/accelwattch/noc.cc diff --git a/src/gpuwattch/noc.h b/src/accelwattch/noc.h similarity index 100% rename from src/gpuwattch/noc.h rename to src/accelwattch/noc.h diff --git a/src/gpuwattch/processor.cc b/src/accelwattch/processor.cc similarity index 99% rename from src/gpuwattch/processor.cc rename to 
src/accelwattch/processor.cc index fc6db463d..9e7f5b2c5 100644 --- a/src/gpuwattch/processor.cc +++ b/src/accelwattch/processor.cc @@ -30,11 +30,13 @@ ***************************************************************************/ /******************************************************************** * Modified by: - ** Jingwen Leng, Univeristy of Texas, Austin * Syed Gilani, - *University of Wisconsin–Madison * Tayler Hetherington, - *University of British Columbia * Ahmed ElTantawy, University of - *British Columbia * + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison + * Tayler Hetherington, University of British Columbia + * Ahmed ElTantawy, University of British Columbia + * Vijay Kandiah, Northwestern University ********************************************************************/ + #include "processor.h" #include #include @@ -118,7 +120,7 @@ Processor::Processor(ParseXML *XML_interface) set_pppm(pppm_t, cores[i]->clockRate * procdynp.numCore, procdynp.numCore, procdynp.numCore, procdynp.numCore); // set the exClockRate - exClockRate = cores[0]->clockRate * 2; // TODO; get from XML file + exClockRate = cores[0]->clockRate; // TODO; get from XML file // cout<<"****EX clock rate:"<power * pppm_t; set_pppm(pppm_t, 1 / cores[i]->executionTime, procdynp.numCore, diff --git a/src/gpuwattch/processor.h b/src/accelwattch/processor.h similarity index 100% rename from src/gpuwattch/processor.h rename to src/accelwattch/processor.h diff --git a/src/gpuwattch/quadro.xml b/src/accelwattch/quadro.xml similarity index 100% rename from src/gpuwattch/quadro.xml rename to src/accelwattch/quadro.xml diff --git a/src/gpuwattch/results/Alpha21364 b/src/accelwattch/results/Alpha21364 similarity index 100% rename from src/gpuwattch/results/Alpha21364 rename to src/accelwattch/results/Alpha21364 diff --git a/src/gpuwattch/results/Alpha21364_90nm b/src/accelwattch/results/Alpha21364_90nm similarity index 100% rename from 
src/gpuwattch/results/Alpha21364_90nm rename to src/accelwattch/results/Alpha21364_90nm diff --git a/src/gpuwattch/results/Penryn b/src/accelwattch/results/Penryn similarity index 100% rename from src/gpuwattch/results/Penryn rename to src/accelwattch/results/Penryn diff --git a/src/gpuwattch/results/T1 b/src/accelwattch/results/T1 similarity index 100% rename from src/gpuwattch/results/T1 rename to src/accelwattch/results/T1 diff --git a/src/gpuwattch/results/T1_DC_64 b/src/accelwattch/results/T1_DC_64 similarity index 100% rename from src/gpuwattch/results/T1_DC_64 rename to src/accelwattch/results/T1_DC_64 diff --git a/src/gpuwattch/results/T1_SBT_64 b/src/accelwattch/results/T1_SBT_64 similarity index 100% rename from src/gpuwattch/results/T1_SBT_64 rename to src/accelwattch/results/T1_SBT_64 diff --git a/src/gpuwattch/results/T1_ST_64 b/src/accelwattch/results/T1_ST_64 similarity index 100% rename from src/gpuwattch/results/T1_ST_64 rename to src/accelwattch/results/T1_ST_64 diff --git a/src/gpuwattch/results/T2 b/src/accelwattch/results/T2 similarity index 100% rename from src/gpuwattch/results/T2 rename to src/accelwattch/results/T2 diff --git a/src/gpuwattch/results/Xeon_core b/src/accelwattch/results/Xeon_core similarity index 100% rename from src/gpuwattch/results/Xeon_core rename to src/accelwattch/results/Xeon_core diff --git a/src/gpuwattch/results/Xeon_uncore b/src/accelwattch/results/Xeon_uncore similarity index 100% rename from src/gpuwattch/results/Xeon_uncore rename to src/accelwattch/results/Xeon_uncore diff --git a/src/gpuwattch/sharedcache.cc b/src/accelwattch/sharedcache.cc similarity index 100% rename from src/gpuwattch/sharedcache.cc rename to src/accelwattch/sharedcache.cc diff --git a/src/gpuwattch/sharedcache.h b/src/accelwattch/sharedcache.h similarity index 100% rename from src/gpuwattch/sharedcache.h rename to src/accelwattch/sharedcache.h diff --git a/src/gpuwattch/technology_xeon_core.cc b/src/accelwattch/technology_xeon_core.cc 
similarity index 100% rename from src/gpuwattch/technology_xeon_core.cc rename to src/accelwattch/technology_xeon_core.cc diff --git a/src/gpuwattch/version.h b/src/accelwattch/version.h similarity index 100% rename from src/gpuwattch/version.h rename to src/accelwattch/version.h diff --git a/src/gpuwattch/xmlParser.cc b/src/accelwattch/xmlParser.cc similarity index 100% rename from src/gpuwattch/xmlParser.cc rename to src/accelwattch/xmlParser.cc diff --git a/src/gpuwattch/xmlParser.h b/src/accelwattch/xmlParser.h similarity index 100% rename from src/gpuwattch/xmlParser.h rename to src/accelwattch/xmlParser.h diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index 71f0703ac..f9e5db314 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan, Jimmy Kwa -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, +// George L. Yuan, Jimmy Kwa, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. 
Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -588,65 +589,119 @@ void ptx_instruction::set_fp_or_int_archop() { oprnd_type = INT_OP; } } -void ptx_instruction::set_mul_div_or_other_archop() { - sp_op = OTHER_OP; - if ((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) && - (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) && - (m_opcode != RETP_OP) && (m_opcode != RET_OP) && (m_opcode != CALLP_OP) && - (m_opcode != CALL_OP)) { - if (get_type() == F32_TYPE || get_type() == F64_TYPE || - get_type() == FF64_TYPE) { - switch (get_opcode()) { - case MUL_OP: - case MAD_OP: - sp_op = FP_MUL_OP; - break; - case DIV_OP: - sp_op = FP_DIV_OP; - break; - case LG2_OP: - sp_op = FP_LG_OP; - break; - case RSQRT_OP: - case SQRT_OP: - sp_op = FP_SQRT_OP; - break; - case RCP_OP: - sp_op = FP_DIV_OP; - break; - case SIN_OP: - case COS_OP: - sp_op = FP_SIN_OP; - break; - case EX2_OP: - sp_op = FP_EXP_OP; - break; - default: - if ((op == ALU_OP) || (op == TENSOR_CORE_OP)) sp_op = FP__OP; - break; + +void ptx_instruction::set_mul_div_or_other_archop(){ + sp_op=OTHER_OP; + if((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) && (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) && (m_opcode != RETP_OP) && 
(m_opcode != RET_OP) && (m_opcode != CALLP_OP) && (m_opcode != CALL_OP)){ + if(get_type() == F64_TYPE || get_type() == FF64_TYPE){ + switch(get_opcode()){ + case MUL_OP: + case MAD_OP: + case FMA_OP: + sp_op=DP_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op=DP_DIV_OP; + break; + case RCP_OP: + sp_op=DP_DIV_OP; + break; + case LG2_OP: + sp_op=FP_LG_OP; + break; + case RSQRT_OP: + case SQRT_OP: + sp_op=FP_SQRT_OP; + break; + case SIN_OP: + case COS_OP: + sp_op=FP_SIN_OP; + break; + case EX2_OP: + sp_op=FP_EXP_OP; + break; + case MMA_OP: + sp_op=TENSOR__OP; + break; + case TEX_OP: + sp_op=TEX__OP; + break; + default: + if((op==DP_OP) || (op==ALU_OP)) + sp_op=DP___OP; + break; + } } - } else { - switch (get_opcode()) { - case MUL24_OP: - case MAD24_OP: - sp_op = INT_MUL24_OP; - break; - case MUL_OP: - case MAD_OP: - if (get_type() == U32_TYPE || get_type() == S32_TYPE || - get_type() == B32_TYPE) - sp_op = INT_MUL32_OP; - else - sp_op = INT_MUL_OP; - break; - case DIV_OP: - sp_op = INT_DIV_OP; - break; - default: - if ((op == ALU_OP)) sp_op = INT__OP; - break; + else if(get_type()==F16_TYPE || get_type()==F32_TYPE){ + switch(get_opcode()){ + case MUL_OP: + case MAD_OP: + case FMA_OP: + sp_op=FP_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op=FP_DIV_OP; + break; + case RCP_OP: + sp_op=FP_DIV_OP; + break; + case LG2_OP: + sp_op=FP_LG_OP; + break; + case RSQRT_OP: + case SQRT_OP: + sp_op=FP_SQRT_OP; + break; + case SIN_OP: + case COS_OP: + sp_op=FP_SIN_OP; + break; + case EX2_OP: + sp_op=FP_EXP_OP; + break; + case MMA_OP: + sp_op=TENSOR__OP; + break; + case TEX_OP: + sp_op=TEX__OP; + break; + default: + if((op==SP_OP) || (op==ALU_OP)) + sp_op=FP__OP; + break; + } + }else { + switch(get_opcode()){ + case MUL24_OP: + case MAD24_OP: + sp_op=INT_MUL24_OP; + break; + case MUL_OP: + case MAD_OP: + case FMA_OP: + if(get_type()==U32_TYPE || get_type()==S32_TYPE || get_type()==B32_TYPE) + sp_op=INT_MUL32_OP; + else + sp_op=INT_MUL_OP; + break; + case DIV_OP: + case 
REM_OP: + sp_op=INT_DIV_OP; + break; + case MMA_OP: + sp_op=TENSOR__OP; + break; + case TEX_OP: + sp_op=TEX__OP; + break; + default: + if((op==INTP_OP) || (op==ALU_OP)) + sp_op=INT__OP; + break; + } } - } } } @@ -880,6 +935,7 @@ void ptx_instruction::set_opcode_and_latency() { case MAD_OP: case MADC_OP: case MADP_OP: + case FMA_OP: // MAD latency switch (get_type()) { case F32_TYPE: @@ -903,7 +959,18 @@ void ptx_instruction::set_opcode_and_latency() { break; } break; + case MUL24_OP: //MUL24 is performed on mul32 units (with additional instructions for bitmasking) on devices with compute capability >1.x + latency = int_latency[2]+1; + initiation_interval = int_init[2]+1; + op = INTP_OP; + break; + case MAD24_OP: + latency = int_latency[3]+1; + initiation_interval = int_init[3]+1; + op = INTP_OP; + break; case DIV_OP: + case REM_OP: // Floating point only op = SFU_OP; switch (get_type()) { diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 0b990e83c..44afbe5aa 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Jimmy Kwa, George L. Yuan -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// Jimmy Kwa, George L. Yuan, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,6 +27,7 @@ // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. + #include "instructions.h" #include "half.h" #include "half.hpp" @@ -3977,7 +3979,7 @@ void mad_def(const ptx_instruction *pI, ptx_thread_info *thread, fesetround(FE_TOWARDZERO); break; default: - assert(0); + //assert(0); break; } d.f32 = a.f32 * b.f32 + c.f32; @@ -4323,11 +4325,8 @@ void mul_impl(const ptx_instruction *pI, ptx_thread_info *thread) { case S64_TYPE: t.s64 = a.s64 * b.s64; assert(!pI->is_wide()); - assert(!pI->is_hi()); - if (pI->is_lo()) - d.s64 = t.s64; - else - assert(0); + //assert(!pI->is_hi()); + d.s64 = t.s64; break; case U16_TYPE: t.u32 = ((unsigned)a.u16) * ((unsigned)b.u16); diff --git a/src/cuda-sim/ptx.l b/src/cuda-sim/ptx.l index 675404597..7706f0b31 100644 --- a/src/cuda-sim/ptx.l +++ b/src/cuda-sim/ptx.l @@ -1,32 +1,34 @@ /* -Copyright (c) 2009-2011, Tor M. 
Aamodt -The University of British Columbia +Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas +The University of British Columbia, Northwestern University All rights reserved. - Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: -Redistributions of source code must retain the above copyright notice, this -list of conditions and the following disclaimer. -Redistributions in binary form must reproduce the above copyright notice, this -list of conditions and the following disclaimer in the documentation and/or -other materials provided with the distribution. -Neither the name of The University of British Columbia nor the names of its -contributors may be used to endorse or promote products derived from this -software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer; +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution; +3. 
Neither the names of The University of British Columbia, Northwestern + University nor the names of their contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. 
*/ + %option nounput %option noyywrap %option yylineno @@ -69,6 +71,7 @@ andn TC; yylval->int_value = ANDN_OP; return OPCODE; atom TC; yylval->int_value = ATOM_OP; return OPCODE; bar.warp TC; yylval->int_value = NOP_OP; return OPCODE; bar TC; yylval->int_value = BAR_OP; return OPCODE; +barrier TC; yylval->int_value = BAR_OP; return OPCODE; bfe TC; yylval->int_value = BFE_OP; return OPCODE; bfi TC; yylval->int_value = BFI_OP; return OPCODE; bfind TC; yylval->int_value = BFIND_OP; return OPCODE; @@ -167,14 +170,22 @@ breakaddr TC; yylval->int_value = BREAKADDR_OP; return OPCODE; "CPTX_END" printf("ENDING CUSTOM PTX.\n"); BEGIN(IN_COMMENT); { -\.a\.sync TC; yylval->int_value = LOAD_A; return WMMA_DIRECTIVE; -\.b\.sync TC; yylval->int_value = LOAD_B; return WMMA_DIRECTIVE; -\.c\.sync TC; yylval->int_value = LOAD_C; return WMMA_DIRECTIVE; -\.d\.sync TC; yylval->int_value = STORE_D; return WMMA_DIRECTIVE; -\.mma\.sync TC;yylval->int_value=MMA; return WMMA_DIRECTIVE; +\.a\.sync\.aligned TC; yylval->int_value = LOAD_A; return WMMA_DIRECTIVE; +\.b\.sync\.aligned TC; yylval->int_value = LOAD_B; return WMMA_DIRECTIVE; +\.c\.sync\.aligned TC; yylval->int_value = LOAD_C; return WMMA_DIRECTIVE; +\.d\.sync\.aligned TC; yylval->int_value = STORE_D; return WMMA_DIRECTIVE; +\.mma\.sync\.aligned TC;yylval->int_value=MMA; return WMMA_DIRECTIVE; \.row TC; yylval->int_value = ROW; return LAYOUT; \.col TC; yylval->int_value = COL; return LAYOUT; +\.m16n16k16\.global TC; yylval->int_value = M16N16K16; return CONFIGURATION; +\.m32n8k16\.global TC; yylval->int_value = M32N8K16; return CONFIGURATION; +\.m8n32k16\.global TC; yylval->int_value = M8N32K16; return CONFIGURATION; + +\.m16n16k16\.shared TC; yylval->int_value = M16N16K16; return CONFIGURATION; +\.m32n8k16\.shared TC; yylval->int_value = M32N8K16; return CONFIGURATION; +\.m8n32k16\.shared TC; yylval->int_value = M8N32K16; return CONFIGURATION; + \.m16n16k16 TC; yylval->int_value = M16N16K16; return CONFIGURATION; \.m32n8k16 TC; 
yylval->int_value = M32N8K16; return CONFIGURATION; \.m8n32k16 TC; yylval->int_value = M8N32K16; return CONFIGURATION; @@ -476,4 +487,4 @@ int ptx_error( yyscan_t yyscanner, ptx_recognizer* recognizer, const char *s ) fflush(stdout); //exit(1); return 0; -} +} \ No newline at end of file diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index d3da4b541..2edc1ed56 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, +// George L. Yuan, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. 
Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -1384,6 +1385,8 @@ ptx_instruction::ptx_instruction( case CS_OPTION: case LU_OPTION: case CV_OPTION: + case WB_OPTION: + case WT_OPTION: m_cache_option = last_ptx_inst_option; break; case HALF_OPTION: diff --git a/src/gpgpu-sim/dram.cc b/src/gpgpu-sim/dram.cc index ca47c4684..545c45dfd 100644 --- a/src/gpgpu-sim/dram.cc +++ b/src/gpgpu-sim/dram.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Ivan Sham, George L. Yuan, -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// Ivan Sham, George L. Yuan, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. 
Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -855,7 +856,7 @@ void dram_t::visualizer_print(gzFile visualizer_file) { void dram_t::set_dram_power_stats(unsigned &cmd, unsigned &activity, unsigned &nop, unsigned &act, unsigned &pre, - unsigned &rd, unsigned &wr, + unsigned &rd, unsigned &wr, unsigned &wr_WB, unsigned &req) const { // Point power performance counters to low-level DRAM counters cmd = n_cmd; @@ -865,6 +866,7 @@ void dram_t::set_dram_power_stats(unsigned &cmd, unsigned &activity, pre = n_pre; rd = n_rd; wr = n_wr; + wr_WB = n_wr_WB; req = n_req; } diff --git a/src/gpgpu-sim/dram.h b/src/gpgpu-sim/dram.h index 6c212e9be..88e46ed7b 100644 --- a/src/gpgpu-sim/dram.h +++ b/src/gpgpu-sim/dram.h @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ivan Sham, Ali Bakhoda, -// George L. Yuan, Wilson W.L. Fung -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ivan Sham, Ali Bakhoda, +// George L. Yuan, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -135,7 +136,7 @@ class dram_t { // Power Model void set_dram_power_stats(unsigned &cmd, unsigned &activity, unsigned &nop, unsigned &act, unsigned &pre, unsigned &rd, - unsigned &wr, unsigned &req) const; + unsigned &wr, unsigned &wr_WB, unsigned &req) const; const memory_config *m_config; diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 7416246f0..a2aeec57f 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. 
Aamodt, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -642,6 +643,7 @@ void cache_stats::clear() { /// for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) { std::fill(m_stats[i].begin(), m_stats[i].end(), 0); + std::fill(m_stats_pw[i].begin(), m_stats_pw[i].end(), 0); std::fill(m_fail_stats[i].begin(), m_fail_stats[i].end(), 0); } m_cache_port_available_cycles = 0; diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 67d084cbf..498dfebd0 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. 
Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 56ede056c..e44551ee3 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, George L. Yuan, -// Ali Bakhoda, Andrew Turner, Ivan Sham -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, George L. Yuan, +// Ali Bakhoda, Andrew Turner, Ivan Sham, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. 
Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -95,10 +96,11 @@ tr1_hash_map address_random_interleaving; #include "mem_latency_stat.h" + void power_config::reg_options(class OptionParser *opp) { - option_parser_register(opp, "-gpuwattch_xml_file", OPT_CSTR, - &g_power_config_name, "GPUWattch XML file", - "gpuwattch.xml"); + option_parser_register(opp, "-accelwattch_xml_file", OPT_CSTR, + &g_power_config_name, "AccelWattch XML file", + "accelwattch_sass_sim.xml"); option_parser_register(opp, "-power_simulation_enabled", OPT_BOOL, &g_power_simulation_enabled, @@ -108,6 +110,92 @@ void power_config::reg_options(class OptionParser *opp) { &g_power_per_cycle_dump, "Dump detailed power output each cycle", "0"); + + + + option_parser_register(opp, "-hw_perf_file_name", OPT_CSTR, + &g_hw_perf_file_name, "Hardware Performance Statistics file", + "hw_perf.csv"); + + option_parser_register(opp, "-hw_perf_bench_name", OPT_CSTR, + &g_hw_perf_bench_name, "Kernel Name in Hardware Performance Statistics file", + ""); + + option_parser_register(opp, "-power_simulation_mode", OPT_INT32, + &g_power_simulation_mode, + "Switch performance counter input for power simulation (0=Sim, 1=HW, 2=HW-Sim Hybrid)", "0"); + + option_parser_register(opp, "-dvfs_enabled", OPT_BOOL, + &g_dvfs_enabled, 
+ "Turn on DVFS for power model", "0"); + option_parser_register(opp, "-aggregate_power_stats", OPT_BOOL, + &g_aggregate_power_stats, + "Accumulate power across all kernels", "0"); + + //Accelwattch Hyrbid Configuration + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_RH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_RH], + "Get L1 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_RM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_RM], + "Get L1 Read Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_WH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_WH], + "Get L1 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_WM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_WM], + "Get L1 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_RH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_RH], + "Get L2 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_RM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_RM], + "Get L2 Read Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_WH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_WH], + "Get L2 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_WM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_WM], + "Get L2 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_CC_ACC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_CC_ACC], + "Get Constant Cache Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, 
"-accelwattch_hybrid_perfsim_SHARED_ACC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_SHRD_ACC], + "Get Shared Memory Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_DRAM_RD", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_DRAM_RD], + "Get DRAM Reads for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register(opp, "-accelwattch_hybrid_perfsim_DRAM_WR", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_DRAM_WR], + "Get DRAM Writes for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_NOC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_NOC], + "Get Interconnect Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_PIPE_DUTY", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_PIPE_DUTY], + "Get Pipeline Duty Cycle Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_NUM_SM_IDLE", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_NUM_SM_IDLE], + "Get Number of Idle SMs for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_CYCLES", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_CYCLES], + "Get Executed Cycles for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register(opp, "-accelwattch_hybrid_perfsim_VOLTAGE", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_VOLTAGE], + "Get Chip Voltage for Accelwattch-Hybrid from Accel-Sim", "0"); + + // Output Data Formats option_parser_register( opp, "-power_trace_enabled", OPT_BOOL, &g_power_trace_enabled, @@ -835,7 +923,7 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) #ifdef GPGPUSIM_POWER_MODEL m_gpgpusim_wrapper = new gpgpu_sim_wrapper(config.g_power_simulation_enabled, - config.g_power_config_name); + config.g_power_config_name, config.g_power_simulation_mode, 
config.g_dvfs_enabled); #endif m_shader_stats = new shader_core_stats(m_shader_config); @@ -1010,6 +1098,14 @@ void gpgpu_sim::init() { partiton_reqs_in_parallel_util = 0; gpu_sim_cycle_parition_util = 0; +// McPAT initialization function. Called on first launch of GPU +#ifdef GPGPUSIM_POWER_MODEL + if (m_config.g_power_simulation_enabled) { + init_mcpat(m_config, m_gpgpusim_wrapper, m_config.gpu_stat_sample_freq, + gpu_tot_sim_insn, gpu_sim_insn); + } +#endif + reinit_clock_domains(); gpgpu_ctx->func_sim->set_param_gpgpu_num_shaders(m_config.num_shader()); for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) @@ -1035,14 +1131,6 @@ void gpgpu_sim::init() { } if (g_network_mode) icnt_init(); - - // McPAT initialization function. Called on first launch of GPU -#ifdef GPGPUSIM_POWER_MODEL - if (m_config.g_power_simulation_enabled) { - init_mcpat(m_config, m_gpgpusim_wrapper, m_config.gpu_stat_sample_freq, - gpu_tot_sim_insn, gpu_sim_insn); - } -#endif } void gpgpu_sim::update_stats() { @@ -1067,6 +1155,11 @@ void gpgpu_sim::update_stats() { gpu_occupancy = occupancy_stats(); } +PowerscalingCoefficients *gpgpu_sim::get_scaling_coeffs() +{ + return m_gpgpusim_wrapper->get_scaling_coeffs(); +} + void gpgpu_sim::print_stats() { gpgpu_ctx->stats->ptx_file_line_stats_write_file(); gpu_print_stat(); @@ -1146,6 +1239,18 @@ std::string gpgpu_sim::executed_kernel_info_string() { return statout.str(); } + +std::string gpgpu_sim::executed_kernel_name() { + std::stringstream statout; + if( m_executed_kernel_names.size() == 1) + statout << m_executed_kernel_names[0]; + else{ + for (unsigned int k = 0; k < m_executed_kernel_names.size(); k++) { + statout << m_executed_kernel_names[k] << " "; + } + } + return statout.str(); +} void gpgpu_sim::set_cache_config(std::string kernel_name, FuncCache cacheConfig) { m_special_cache_config[kernel_name] = cacheConfig; @@ -1326,10 +1431,20 @@ void gpgpu_sim::gpu_print_stat() { m_shader_stats->print(stdout); #ifdef 
GPGPUSIM_POWER_MODEL if (m_config.g_power_simulation_enabled) { + if(m_config.g_power_simulation_mode > 0){ + //if(!m_config.g_aggregate_power_stats) + mcpat_reset_perf_count(m_gpgpusim_wrapper); + calculate_hw_mcpat(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, + m_power_stats, m_config.gpu_stat_sample_freq, + gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, + gpu_sim_insn, m_config.g_power_simulation_mode, m_config.g_dvfs_enabled, + m_config.g_hw_perf_file_name, m_config.g_hw_perf_bench_name, executed_kernel_name(), m_config.accelwattch_hybrid_configuration, m_config.g_aggregate_power_stats); + } m_gpgpusim_wrapper->print_power_kernel_stats( gpu_sim_cycle, gpu_tot_sim_cycle, gpu_tot_sim_insn + gpu_sim_insn, kernel_info_str, true); - mcpat_reset_perf_count(m_gpgpusim_wrapper); + //if(!m_config.g_aggregate_power_stats) + mcpat_reset_perf_count(m_gpgpusim_wrapper); } #endif @@ -1796,6 +1911,7 @@ void gpgpu_sim::cycle() { m_power_stats->pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_req[CURRENT_STAT_IDX][i]); } } @@ -1839,7 +1955,7 @@ void gpgpu_sim::cycle() { m_cluster[i]->core_cycle(); *active_sms += m_cluster[i]->get_n_active_sms(); } - // Update core icnt/cache stats for GPUWattch + // Update core icnt/cache stats for AccelWattch m_cluster[i]->get_icnt_stats( m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); @@ -1869,10 +1985,12 @@ void gpgpu_sim::cycle() { // McPAT main cycle (interface with McPAT) #ifdef GPGPUSIM_POWER_MODEL if (m_config.g_power_simulation_enabled) { + if(m_config.g_power_simulation_mode == 0){ mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, m_power_stats, m_config.gpu_stat_sample_freq, gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, - 
gpu_sim_insn); + gpu_sim_insn, m_config.g_dvfs_enabled); + } } #endif diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 2e6820d82..68b3dfa10 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,6 +27,7 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. + #ifndef GPU_SIM_H #define GPU_SIM_H @@ -68,6 +70,29 @@ extern tr1_hash_map address_random_interleaving; enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 }; +enum hw_perf_t { + HW_BENCH_NAME=0, + HW_KERNEL_NAME, + HW_L1_RH, + HW_L1_RM, + HW_L1_WH, + HW_L1_WM, + HW_CC_ACC, + HW_SHRD_ACC, + HW_DRAM_RD, + HW_DRAM_WR, + HW_L2_RH, + HW_L2_RM, + HW_L2_WH, + HW_L2_WM, + HW_NOC, + HW_PIPE_DUTY, + HW_NUM_SM_IDLE, + HW_CYCLES, + HW_VOLTAGE, + HW_TOTAL_STATS +}; + struct power_config { power_config() { m_valid = true; } void init() { @@ -82,7 +107,8 @@ struct power_config { s++; } char buf1[1024]; - snprintf(buf1, 1024, "gpgpusim_power_report__%s.log", date); + //snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date); + snprintf(buf1, 1024, "accelwattch_power_report.log"); g_power_filename = strdup(buf1); char buf2[1024]; snprintf(buf2, 1024, "gpgpusim_power_trace_report__%s.log.gz", date); @@ -94,6 +120,9 @@ struct power_config { snprintf(buf4, 1024, "gpgpusim_steady_state_tracking_report__%s.log.gz", date); g_steady_state_tracking_filename = strdup(buf4); + // for(int i =0; i< hw_perf_t::HW_TOTAL_STATS; i++){ + // accelwattch_hybrid_configuration[i] = 0; + // } if (g_steady_power_levels_enabled) { sscanf(gpu_steady_state_definition, "%lf:%lf", @@ -125,6 +154,14 @@ struct power_config { double gpu_steady_power_deviation; double gpu_steady_min_period; + + char *g_hw_perf_file_name; + char *g_hw_perf_bench_name; + int g_power_simulation_mode; + bool g_dvfs_enabled; + bool g_aggregate_power_stats; + bool accelwattch_hybrid_configuration[hw_perf_t::HW_TOTAL_STATS]; + // Nonlinear power model bool g_use_nonlinear_model; char *gpu_nonlinear_model_config; @@ -357,7 +394,7 @@ class 
gpgpu_sim_config : public power_config, m_valid = true; } - + unsigned get_core_freq() const { return core_freq; } unsigned num_shader() const { return m_shader_config.num_shader(); } unsigned num_cluster() const { return m_shader_config.n_simt_clusters; } unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel; } @@ -527,6 +564,7 @@ class gpgpu_sim : public gpgpu_t { bool kernel_more_cta_left(kernel_info_t *kernel) const; bool hit_max_cta_count() const; kernel_info_t *select_kernel(); + PowerscalingCoefficients *get_scaling_coeffs(); void decrement_kernel_latency(); const gpgpu_sim_config &get_config() const { return m_config; } @@ -634,6 +672,7 @@ class gpgpu_sim : public gpgpu_t { std::string executed_kernel_info_string(); //< format the kernel information // into a string for stat printout + std::string executed_kernel_name(); void clear_executed_kernel_info(); //< clear the kernel information after // stat printout virtual void createSIMTCluster() = 0; diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index f1c761fe5..511c15efa 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -388,9 +389,9 @@ void memory_partition_unit::set_done(mem_fetch *mf) { void memory_partition_unit::set_dram_power_stats( unsigned &n_cmd, unsigned &n_activity, unsigned &n_nop, unsigned &n_act, - unsigned &n_pre, unsigned &n_rd, unsigned &n_wr, unsigned &n_req) const { + unsigned &n_pre, unsigned &n_rd, unsigned &n_wr, unsigned &n_wr_WB, unsigned &n_req) const { m_dram->set_dram_power_stats(n_cmd, n_activity, n_nop, n_act, n_pre, n_rd, - n_wr, n_req); + n_wr, n_wr_WB, n_req); } void memory_partition_unit::print(FILE *fp) const { @@ -664,6 +665,7 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { unsigned pre = 0; unsigned rd = 0; unsigned wr = 0; + unsigned wr_WB = 0; unsigned req = 0; unsigned tot_cmd = 0; unsigned tot_nop = 0; @@ -675,13 +677,13 @@ void gpgpu_sim::print_dram_stats(FILE *fout) const { for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { m_memory_partition_unit[i]->set_dram_power_stats(cmd, activity, nop, act, - pre, rd, wr, req); + pre, rd, wr, wr_WB, req); tot_cmd += cmd; 
tot_nop += nop; tot_act += act; tot_pre += pre; tot_rd += rd; - tot_wr += wr; + tot_wr += wr + wr_WB; tot_req += req; } fprintf(fout, "gpgpu_n_dram_reads = %d\n", tot_rd); diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index beed76562..902a4b7c0 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -95,7 +96,7 @@ class memory_partition_unit { // Power model void set_dram_power_stats(unsigned &n_cmd, unsigned &n_activity, unsigned &n_nop, unsigned &n_act, unsigned &n_pre, - unsigned &n_rd, unsigned &n_wr, + unsigned &n_rd, unsigned &n_wr, unsigned &n_wr_WB, unsigned &n_req) const; int global_sub_partition_id_to_local_id(int global_sub_partition_id) const; diff --git a/src/gpgpu-sim/power_interface.cc b/src/gpgpu-sim/power_interface.cc index c637d846f..63b985260 100644 --- a/src/gpgpu-sim/power_interface.cc +++ b/src/gpgpu-sim/power_interface.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. 
Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -26,8 +27,10 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. + #include "power_interface.h" + void init_mcpat(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, unsigned stat_sample_freq, unsigned tot_inst, unsigned inst) { @@ -38,7 +41,11 @@ void init_mcpat(const gpgpu_sim_config &config, config.g_power_simulation_enabled, config.g_power_trace_enabled, config.g_steady_power_levels_enabled, config.g_power_per_cycle_dump, config.gpu_steady_power_deviation, config.gpu_steady_min_period, - config.g_power_trace_zlevel, tot_inst + inst, stat_sample_freq); + config.g_power_trace_zlevel, tot_inst + inst, stat_sample_freq, + config.g_power_simulation_mode, + config.g_dvfs_enabled, + config.get_core_freq()/1000000, + config.num_shader()); } void mcpat_cycle(const gpgpu_sim_config &config, @@ -46,7 +53,7 @@ void mcpat_cycle(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, unsigned tot_inst, - unsigned inst) { + unsigned inst, bool dvfs_enabled) { static bool mcpat_init = true; if (mcpat_init) { // If first cycle, don't have any power numbers yet @@ -55,41 +62,45 @@ void mcpat_cycle(const gpgpu_sim_config &config, } if ((tot_cycle + cycle) % stat_sample_freq == 0) { + 
if(dvfs_enabled){ + wrapper->set_model_voltage(1); //performance model needs to support this. + } + wrapper->set_inst_power( shdr_config->gpgpu_clock_gated_lanes, stat_sample_freq, - stat_sample_freq, power_stats->get_total_inst(), - power_stats->get_total_int_inst(), power_stats->get_total_fp_inst(), - power_stats->get_l1d_read_accesses(), - power_stats->get_l1d_write_accesses(), - power_stats->get_committed_inst()); + stat_sample_freq, power_stats->get_total_inst(0), + power_stats->get_total_int_inst(0), power_stats->get_total_fp_inst(0), + power_stats->get_l1d_read_accesses(0), + power_stats->get_l1d_write_accesses(0), + power_stats->get_committed_inst(0)); // Single RF for both int and fp ops - wrapper->set_regfile_power(power_stats->get_regfile_reads(), - power_stats->get_regfile_writes(), - power_stats->get_non_regfile_operands()); + wrapper->set_regfile_power(power_stats->get_regfile_reads(0), + power_stats->get_regfile_writes(0), + power_stats->get_non_regfile_operands(0)); // Instruction cache stats - wrapper->set_icache_power(power_stats->get_inst_c_hits(), - power_stats->get_inst_c_misses()); + wrapper->set_icache_power(power_stats->get_inst_c_hits(0), + power_stats->get_inst_c_misses(0)); // Constant Cache, shared memory, texture cache - wrapper->set_ccache_power(power_stats->get_constant_c_hits(), - power_stats->get_constant_c_misses()); + wrapper->set_ccache_power(power_stats->get_const_accessess(0), 0); //assuming all HITS in constant cache for now wrapper->set_tcache_power(power_stats->get_texture_c_hits(), power_stats->get_texture_c_misses()); - wrapper->set_shrd_mem_power(power_stats->get_shmem_read_access()); + wrapper->set_shrd_mem_power(power_stats->get_shmem_access(0)); wrapper->set_l1cache_power( - power_stats->get_l1d_read_hits(), power_stats->get_l1d_read_misses(), - power_stats->get_l1d_write_hits(), power_stats->get_l1d_write_misses()); + power_stats->get_l1d_read_hits(0), power_stats->get_l1d_read_misses(0), + 
power_stats->get_l1d_write_hits(0), power_stats->get_l1d_write_misses(0)); wrapper->set_l2cache_power( - power_stats->get_l2_read_hits(), power_stats->get_l2_read_misses(), - power_stats->get_l2_write_hits(), power_stats->get_l2_write_misses()); + power_stats->get_l2_read_hits(0), power_stats->get_l2_read_misses(0), + power_stats->get_l2_write_hits(0), power_stats->get_l2_write_misses(0)); float active_sms = (*power_stats->m_active_sms) / stat_sample_freq; float num_cores = shdr_config->num_shader(); float num_idle_core = num_cores - active_sms; + wrapper->set_num_cores(num_cores); wrapper->set_idle_core_power(num_idle_core); // pipeline power - pipeline_duty_cycle *= percent_active_sms; @@ -101,38 +112,64 @@ void mcpat_cycle(const gpgpu_sim_config &config, wrapper->set_duty_cycle_power(pipeline_duty_cycle); // Memory Controller - wrapper->set_mem_ctrl_power(power_stats->get_dram_rd(), - power_stats->get_dram_wr(), - power_stats->get_dram_pre()); + wrapper->set_mem_ctrl_power(power_stats->get_dram_rd(0), + power_stats->get_dram_wr(0), + power_stats->get_dram_pre(0)); // Execution pipeline accesses // FPU (SP) accesses, Integer ALU (not present in Tesla), Sfu accesses - wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(), - power_stats->get_ialu_accessess(), - power_stats->get_tot_sfu_accessess()); + + wrapper->set_int_accesses(power_stats->get_ialu_accessess(0), + power_stats->get_intmul24_accessess(0), + power_stats->get_intmul32_accessess(0), + power_stats->get_intmul_accessess(0), + power_stats->get_intdiv_accessess(0)); + + wrapper->set_dp_accesses(power_stats->get_dp_accessess(0), + power_stats->get_dpmul_accessess(0), + power_stats->get_dpdiv_accessess(0)); + + wrapper->set_fp_accesses(power_stats->get_fp_accessess(0), + power_stats->get_fpmul_accessess(0), + power_stats->get_fpdiv_accessess(0)); + + wrapper->set_trans_accesses(power_stats->get_sqrt_accessess(0), + power_stats->get_log_accessess(0), + power_stats->get_sin_accessess(0), + 
power_stats->get_exp_accessess(0)); + + wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(0)); + + wrapper->set_tex_accesses(power_stats->get_tex_accessess(0)); + + wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(0), + power_stats->get_ialu_accessess(0), + power_stats->get_tot_sfu_accessess(0)); + + wrapper->set_avg_active_threads(power_stats->get_active_threads(0)); // Average active lanes for sp and sfu pipelines float avg_sp_active_lanes = (power_stats->get_sp_active_lanes()) / stat_sample_freq; float avg_sfu_active_lanes = (power_stats->get_sfu_active_lanes()) / stat_sample_freq; + if(avg_sp_active_lanes >32.0 ) + avg_sp_active_lanes = 32.0; + if(avg_sfu_active_lanes >32.0 ) + avg_sfu_active_lanes = 32.0; assert(avg_sp_active_lanes <= 32); assert(avg_sfu_active_lanes <= 32); - wrapper->set_active_lanes_power( - (power_stats->get_sp_active_lanes()) / stat_sample_freq, - (power_stats->get_sfu_active_lanes()) / stat_sample_freq); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); double n_icnt_simt_to_mem = (double) - power_stats->get_icnt_simt_to_mem(); // # flits from SIMT clusters + power_stats->get_icnt_simt_to_mem(0); // # flits from SIMT clusters // to memory partitions double n_icnt_mem_to_simt = (double) - power_stats->get_icnt_mem_to_simt(); // # flits from memory + power_stats->get_icnt_mem_to_simt(0); // # flits from memory // partitions to SIMT clusters - wrapper->set_NoC_power( - n_icnt_mem_to_simt, - n_icnt_simt_to_mem); // Number of flits traversing the interconnect + wrapper->set_NoC_power(n_icnt_mem_to_simt + n_icnt_simt_to_mem); // Number of flits traversing the interconnect wrapper->compute(); @@ -152,3 +189,336 @@ void mcpat_cycle(const gpgpu_sim_config &config, void mcpat_reset_perf_count(class gpgpu_sim_wrapper *wrapper) { wrapper->reset_counters(); } + +bool parse_hw_file(char* hwpowerfile, bool find_target_kernel, vector &hw_data, char* benchname, std::string executed_kernelname){ + 
fstream hw_file; + hw_file.open(hwpowerfile, ios::in); + string line, word, temp; + while(!hw_file.eof()){ + hw_data.clear(); + getline(hw_file, line); + stringstream s(line); + while (getline(s,word,',')){ + hw_data.push_back(word); + } + if(!hw_data.empty() && hw_data[HW_BENCH_NAME] == std::string(benchname)){ + if(find_target_kernel){ + if(hw_data[HW_KERNEL_NAME] == ""){ + hw_file.close(); + return true; + } + else{ + if(hw_data[HW_KERNEL_NAME] == executed_kernelname){ + hw_file.close(); + return true; + } + } + } + else{ + hw_file.close(); + return true; + } + } + } + hw_file.close(); + return false; +} + + +void calculate_hw_mcpat(const gpgpu_sim_config &config, + const shader_core_config *shdr_config, + class gpgpu_sim_wrapper *wrapper, + class power_stat_t *power_stats, unsigned stat_sample_freq, + unsigned tot_cycle, unsigned cycle, unsigned tot_inst, + unsigned inst, int power_simulation_mode, bool dvfs_enabled, char* hwpowerfile, + char* benchname, std::string executed_kernelname, + const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats){ + + /* Reading HW data from CSV file */ + + vector<string> hw_data; + bool kernel_found = false; + kernel_found = parse_hw_file(hwpowerfile, true, hw_data, benchname, executed_kernelname); //Searching for matching executed_kernelname. + if(!kernel_found) + kernel_found = parse_hw_file(hwpowerfile, false, hw_data, benchname, executed_kernelname); //Searching for any kernel with same benchname. 
+ assert("Could not find perf stats for the target benchmark in hwpowerfile.\n" && (kernel_found)); + unsigned perf_cycles = static_cast(std::stod(hw_data[HW_CYCLES]) + 0.5); + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_CYCLES])) + perf_cycles = cycle; + wrapper->init_mcpat_hw_mode(perf_cycles); //total PERF MODEL cycles for current kernel + + if(dvfs_enabled){ + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_VOLTAGE])) + wrapper->set_model_voltage(1); //performance model needs to support this + else + wrapper->set_model_voltage(std::stod(hw_data[HW_VOLTAGE])); //performance model needs to support this + } + + double l1_read_hits = std::stod(hw_data[HW_L1_RH]); + double l1_read_misses = std::stod(hw_data[HW_L1_RM]); + double l1_write_hits = std::stod(hw_data[HW_L1_WH]); + double l1_write_misses = std::stod(hw_data[HW_L1_WM]); + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_RH])) + l1_read_hits = power_stats->get_l1d_read_hits(1) - power_stats->l1r_hits_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_RM])) + l1_read_misses = power_stats->get_l1d_read_misses(1) - power_stats->l1r_misses_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_WH])) + l1_write_hits = power_stats->get_l1d_write_hits(1) - power_stats->l1w_hits_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_WM])) + l1_write_misses = power_stats->get_l1d_write_misses(1) - power_stats->l1w_misses_kernel; + + if(aggregate_power_stats){ + power_stats->tot_inst_execution += power_stats->get_total_inst(1); + power_stats->tot_int_inst_execution += power_stats->get_total_int_inst(1); + power_stats->tot_fp_inst_execution += power_stats->get_total_fp_inst(1); + power_stats->commited_inst_execution += power_stats->get_committed_inst(1); + wrapper->set_inst_power( + shdr_config->gpgpu_clock_gated_lanes, cycle, //TODO: core.[0] 
cycles counts don't matter, remove this + cycle, power_stats->tot_inst_execution, + power_stats->tot_int_inst_execution, power_stats->tot_fp_inst_execution, + l1_read_hits + l1_read_misses, + l1_write_hits + l1_write_misses, + power_stats->commited_inst_execution); + } + else{ + wrapper->set_inst_power( + shdr_config->gpgpu_clock_gated_lanes, cycle, //TODO: core.[0] cycles counts don't matter, remove this + cycle, power_stats->get_total_inst(1), + power_stats->get_total_int_inst(1), power_stats->get_total_fp_inst(1), + l1_read_hits + l1_read_misses, + l1_write_hits + l1_write_misses, + power_stats->get_committed_inst(1)); + } + + // Single RF for both int and fp ops -- activity factor set to 0 for Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for register files + wrapper->set_regfile_power(power_stats->get_regfile_reads(1), + power_stats->get_regfile_writes(1), + power_stats->get_non_regfile_operands(1)); + + // Instruction cache stats -- activity factor set to 0 for Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for instruction cache + wrapper->set_icache_power(power_stats->get_inst_c_hits(1) - power_stats->l1i_hits_kernel, + power_stats->get_inst_c_misses(1) - power_stats->l1i_misses_kernel); + + // Constant Cache, shared memory, texture cache + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_CC_ACC])) + wrapper->set_ccache_power(power_stats->get_const_accessess(1) - power_stats->cc_accesses_kernel, 0); //assuming all HITS in constant cache for now + else + wrapper->set_ccache_power(std::stod(hw_data[HW_CC_ACC]), 0); //assuming all HITS in constant cache for now + + + // wrapper->set_tcache_power(power_stats->get_texture_c_hits(), + // power_stats->get_texture_c_misses()); + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_SHRD_ACC])) + wrapper->set_shrd_mem_power(power_stats->get_shmem_access(1) - power_stats->shared_accesses_kernel); + else + 
wrapper->set_shrd_mem_power(std::stod(hw_data[HW_SHRD_ACC])); + + wrapper->set_l1cache_power( l1_read_hits, l1_read_misses, l1_write_hits, l1_write_misses); + + double l2_read_hits = std::stod(hw_data[HW_L2_RH]); + double l2_read_misses = std::stod(hw_data[HW_L2_RM]); + double l2_write_hits = std::stod(hw_data[HW_L2_WH]); + double l2_write_misses = std::stod(hw_data[HW_L2_WM]); + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_RH])) + l2_read_hits = power_stats->get_l2_read_hits(1) - power_stats->l2r_hits_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_RM])) + l2_read_misses = power_stats->get_l2_read_misses(1) - power_stats->l2r_misses_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_WH])) + l2_write_hits = power_stats->get_l2_write_hits(1) - power_stats->l2w_hits_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_WM])) + l2_write_misses = power_stats->get_l2_write_misses(1) - power_stats->l2w_misses_kernel; + + wrapper->set_l2cache_power(l2_read_hits, l2_read_misses, l2_write_hits, l2_write_misses); + + float active_sms = (*power_stats->m_active_sms) / stat_sample_freq; + float num_cores = shdr_config->num_shader(); + float num_idle_core = num_cores - active_sms; + wrapper->set_num_cores(num_cores); + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_NUM_SM_IDLE])) + wrapper->set_idle_core_power(num_idle_core); + else + wrapper->set_idle_core_power(std::stod(hw_data[HW_NUM_SM_IDLE])); + + float pipeline_duty_cycle = + ((*power_stats->m_average_pipeline_duty_cycle / (stat_sample_freq)) < + 0.8) + ? 
((*power_stats->m_average_pipeline_duty_cycle) / stat_sample_freq) + : 0.8; + + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_PIPE_DUTY])) + wrapper->set_duty_cycle_power(pipeline_duty_cycle); + else + wrapper->set_duty_cycle_power(std::stod(hw_data[HW_PIPE_DUTY])); + + // Memory Controller + + double dram_reads = std::stod(hw_data[HW_DRAM_RD]); + double dram_writes = std::stod(hw_data[HW_DRAM_WR]); + double dram_pre = 0; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_RD])) + dram_reads = power_stats->get_dram_rd(1) - power_stats->dram_rd_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_WR])) + dram_writes = power_stats->get_dram_wr(1) - power_stats->dram_wr_kernel; + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_RD])) + dram_pre = power_stats->get_dram_pre(1) - power_stats->dram_pre_kernel; + + + wrapper->set_mem_ctrl_power(dram_reads, dram_writes, dram_pre); + + if(aggregate_power_stats){ + power_stats->ialu_acc_execution += power_stats->get_ialu_accessess(1); + power_stats->imul24_acc_execution += power_stats->get_intmul24_accessess(1); + power_stats->imul32_acc_execution += power_stats->get_intmul32_accessess(1); + power_stats->imul_acc_execution += power_stats->get_intmul_accessess(1); + power_stats->idiv_acc_execution += power_stats->get_intdiv_accessess(1); + power_stats->dp_acc_execution += power_stats->get_dp_accessess(1); + power_stats->dpmul_acc_execution += power_stats->get_dpmul_accessess(1); + power_stats->dpdiv_acc_execution += power_stats->get_dpdiv_accessess(1); + power_stats->fp_acc_execution += power_stats->get_fp_accessess(1); + power_stats->fpmul_acc_execution += power_stats->get_fpmul_accessess(1); + power_stats->fpdiv_acc_execution += power_stats->get_fpdiv_accessess(1); + power_stats->sqrt_acc_execution += power_stats->get_sqrt_accessess(1); + power_stats->log_acc_execution += power_stats->get_log_accessess(1); + 
power_stats->sin_acc_execution += power_stats->get_sin_accessess(1); + power_stats->exp_acc_execution += power_stats->get_exp_accessess(1); + power_stats->tensor_acc_execution += power_stats->get_tensor_accessess(1); + power_stats->tex_acc_execution += power_stats->get_tex_accessess(1); + power_stats->tot_fpu_acc_execution += power_stats->get_tot_fpu_accessess(1); + power_stats->tot_sfu_acc_execution += power_stats->get_tot_sfu_accessess(1); + power_stats->tot_threads_acc_execution += power_stats->get_tot_threads_kernel(1); + power_stats->tot_warps_acc_execution += power_stats->get_tot_warps_kernel(1); + + power_stats->sp_active_lanes_execution += (power_stats->get_sp_active_lanes() * shdr_config->num_shader() * shdr_config->gpgpu_num_sp_units); + power_stats->sfu_active_lanes_execution += (power_stats->get_sfu_active_lanes() * shdr_config->num_shader() * shdr_config->gpgpu_num_sp_units); + + wrapper->set_int_accesses(power_stats->ialu_acc_execution, + power_stats->imul24_acc_execution, + power_stats->imul32_acc_execution, + power_stats->imul_acc_execution, + power_stats->idiv_acc_execution); + + wrapper->set_dp_accesses(power_stats->dp_acc_execution, + power_stats->dpmul_acc_execution, + power_stats->dpdiv_acc_execution); + + wrapper->set_fp_accesses(power_stats->fp_acc_execution, + power_stats->fpmul_acc_execution, + power_stats->fpdiv_acc_execution); + + wrapper->set_trans_accesses(power_stats->sqrt_acc_execution, + power_stats->log_acc_execution, + power_stats->sin_acc_execution, + power_stats->exp_acc_execution); + + wrapper->set_tensor_accesses(power_stats->tensor_acc_execution); + + wrapper->set_tex_accesses(power_stats->tex_acc_execution); + + wrapper->set_exec_unit_power(power_stats->tot_fpu_acc_execution, + power_stats->ialu_acc_execution, + power_stats->tot_sfu_acc_execution); + + wrapper->set_avg_active_threads((double)((double)power_stats->tot_threads_acc_execution / (double)power_stats->tot_warps_acc_execution)); + + // Average active lanes for sp and 
sfu pipelines + float avg_sp_active_lanes = + (power_stats->sp_active_lanes_execution) / shdr_config->num_shader() / shdr_config->gpgpu_num_sp_units / stat_sample_freq; + float avg_sfu_active_lanes = + (power_stats->sfu_active_lanes_execution) / shdr_config->num_shader() / shdr_config->gpgpu_num_sp_units / stat_sample_freq; + if(avg_sp_active_lanes >32.0 ) + avg_sp_active_lanes = 32.0; + if(avg_sfu_active_lanes >32.0 ) + avg_sfu_active_lanes = 32.0; + assert(avg_sp_active_lanes <= 32); + assert(avg_sfu_active_lanes <= 32); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + } + else{ + wrapper->set_int_accesses(power_stats->get_ialu_accessess(1), + power_stats->get_intmul24_accessess(1), + power_stats->get_intmul32_accessess(1), + power_stats->get_intmul_accessess(1), + power_stats->get_intdiv_accessess(1)); + + wrapper->set_dp_accesses(power_stats->get_dp_accessess(1), + power_stats->get_dpmul_accessess(1), + power_stats->get_dpdiv_accessess(1)); + + wrapper->set_fp_accesses(power_stats->get_fp_accessess(1), + power_stats->get_fpmul_accessess(1), + power_stats->get_fpdiv_accessess(1)); + + wrapper->set_trans_accesses(power_stats->get_sqrt_accessess(1), + power_stats->get_log_accessess(1), + power_stats->get_sin_accessess(1), + power_stats->get_exp_accessess(1)); + + wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(1)); + + wrapper->set_tex_accesses(power_stats->get_tex_accessess(1)); + + wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(1), + power_stats->get_ialu_accessess(1), + power_stats->get_tot_sfu_accessess(1)); + + wrapper->set_avg_active_threads(power_stats->get_active_threads(1)); + + // Average active lanes for sp and sfu pipelines + float avg_sp_active_lanes = + (power_stats->get_sp_active_lanes()) / stat_sample_freq; + float avg_sfu_active_lanes = + (power_stats->get_sfu_active_lanes()) / stat_sample_freq; + if(avg_sp_active_lanes >32.0 ) + avg_sp_active_lanes = 32.0; + if(avg_sfu_active_lanes 
>32.0 ) + avg_sfu_active_lanes = 32.0; + assert(avg_sp_active_lanes <= 32); + assert(avg_sfu_active_lanes <= 32); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + } + + + double n_icnt_simt_to_mem = + (double) + (power_stats->get_icnt_simt_to_mem(1) - power_stats->noc_tr_kernel); // # flits from SIMT clusters + // to memory partitions + double n_icnt_mem_to_simt = + (double) + (power_stats->get_icnt_mem_to_simt(1)- power_stats->noc_rc_kernel); // # flits from memory + // partitions to SIMT clusters + if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_NOC])) + wrapper->set_NoC_power(n_icnt_mem_to_simt + n_icnt_simt_to_mem); // Number of flits traversing the interconnect from Accel-Sim + else + wrapper->set_NoC_power(std::stod(hw_data[HW_NOC])); // Number of flits traversing the interconnect from HW + + wrapper->compute(); + + wrapper->update_components_power(); + + wrapper->power_metrics_calculations(); + + wrapper->dump(); + power_stats->l1r_hits_kernel = power_stats->get_l1d_read_hits(1); + power_stats->l1r_misses_kernel = power_stats->get_l1d_read_misses(1); + power_stats->l1w_hits_kernel = power_stats->get_l1d_write_hits(1); + power_stats->l1w_misses_kernel = power_stats->get_l1d_write_misses(1); + power_stats->shared_accesses_kernel = power_stats->get_shmem_access(1); + power_stats->cc_accesses_kernel = power_stats->get_const_accessess(1); + power_stats->dram_rd_kernel = power_stats->get_dram_rd(1); + power_stats->dram_wr_kernel = power_stats->get_dram_wr(1); + power_stats->dram_pre_kernel = power_stats->get_dram_pre(1); + power_stats->l1i_hits_kernel = power_stats->get_inst_c_hits(1); + power_stats->l1i_misses_kernel = power_stats->get_inst_c_misses(1); + power_stats->l2r_hits_kernel = power_stats->get_l2_read_hits(1); + power_stats->l2r_misses_kernel = power_stats->get_l2_read_misses(1); + power_stats->l2w_hits_kernel = power_stats->get_l2_write_hits(1); + power_stats->l2w_misses_kernel = 
power_stats->get_l2_write_misses(1); + power_stats->noc_tr_kernel = power_stats->get_icnt_simt_to_mem(1); + power_stats->noc_rc_kernel = power_stats->get_icnt_mem_to_simt(1); + + + power_stats->clear(); +} \ No newline at end of file diff --git a/src/gpgpu-sim/power_interface.h b/src/gpgpu-sim/power_interface.h index 2bfd4d504..1a488948c 100644 --- a/src/gpgpu-sim/power_interface.h +++ b/src/gpgpu-sim/power_interface.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. 
Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -43,7 +44,19 @@ void mcpat_cycle(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, unsigned tot_inst, - unsigned inst); + unsigned inst, bool dvfs_enabled); + +void calculate_hw_mcpat(const gpgpu_sim_config &config, + const shader_core_config *shdr_config, + class gpgpu_sim_wrapper *wrapper, + class power_stat_t *power_stats, unsigned stat_sample_freq, + unsigned tot_cycle, unsigned cycle, unsigned tot_inst, + unsigned inst, int power_simulation_mode, bool dvfs_enabled, + char* hwpowerfile, char* benchname, std::string executed_kernelname, + const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats); + +bool parse_hw_file(char* hwpowerfile, bool find_target_kernel, vector &hw_data, char* benchname, std::string executed_kernelname); + void mcpat_reset_perf_count(class gpgpu_sim_wrapper *wrapper); #endif /* POWER_INTERFACE_H_ */ diff --git a/src/gpgpu-sim/power_stat.cc b/src/gpgpu-sim/power_stat.cc index 7b60ddf84..fd7a77560 100644 --- a/src/gpgpu-sim/power_stat.cc +++ b/src/gpgpu-sim/power_stat.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -54,10 +55,64 @@ power_mem_stat_t::power_mem_stat_t(const memory_config *mem_config, init(); } +void power_stat_t::clear(){ + for(unsigned i=0; i< NUM_STAT_IDX; ++i){ + pwr_mem_stat->core_cache_stats[i].clear(); + pwr_mem_stat->l2_cache_stats[i].clear(); + for(unsigned j=0; jnum_shader(); ++j){ + pwr_core_stat->m_pipeline_duty_cycle[i][j]=0; + pwr_core_stat->m_num_decoded_insn[i][j]=0; + pwr_core_stat->m_num_FPdecoded_insn[i][j]=0; + pwr_core_stat->m_num_INTdecoded_insn[i][j]=0; + pwr_core_stat->m_num_storequeued_insn[i][j]=0; + pwr_core_stat->m_num_loadqueued_insn[i][j]=0; + pwr_core_stat->m_num_tex_inst[i][j]=0; + pwr_core_stat->m_num_ialu_acesses[i][j]=0; + pwr_core_stat->m_num_fp_acesses[i][j]=0; + pwr_core_stat->m_num_imul_acesses[i][j]=0; + pwr_core_stat->m_num_imul24_acesses[i][j]=0; + pwr_core_stat->m_num_imul32_acesses[i][j]=0; + pwr_core_stat->m_num_fpmul_acesses[i][j]=0; + pwr_core_stat->m_num_idiv_acesses[i][j]=0; + pwr_core_stat->m_num_fpdiv_acesses[i][j]=0; + pwr_core_stat->m_num_dp_acesses[i][j]=0; + pwr_core_stat->m_num_dpmul_acesses[i][j]=0; + pwr_core_stat->m_num_dpdiv_acesses[i][j]=0; + pwr_core_stat->m_num_tensor_core_acesses[i][j]=0; + pwr_core_stat->m_num_const_acesses[i][j]=0; + pwr_core_stat->m_num_tex_acesses[i][j]=0; + pwr_core_stat->m_num_sp_acesses[i][j]=0; + pwr_core_stat->m_num_sfu_acesses[i][j]=0; + pwr_core_stat->m_num_sqrt_acesses[i][j]=0; + pwr_core_stat->m_num_log_acesses[i][j]=0; + pwr_core_stat->m_num_sin_acesses[i][j]=0; + pwr_core_stat->m_num_exp_acesses[i][j]=0; + pwr_core_stat->m_num_mem_acesses[i][j]=0; + pwr_core_stat->m_num_sp_committed[i][j]=0; + pwr_core_stat->m_num_sfu_committed[i][j]=0; + pwr_core_stat->m_num_mem_committed[i][j]=0; + pwr_core_stat->m_read_regfile_acesses[i][j]=0; + pwr_core_stat->m_write_regfile_acesses[i][j]=0; + 
pwr_core_stat->m_non_rf_operands[i][j]=0; + pwr_core_stat->m_active_sp_lanes[i][j]=0; + pwr_core_stat->m_active_sfu_lanes[i][j]=0; + pwr_core_stat->m_active_exu_threads[i][j]=0; + pwr_core_stat->m_active_exu_warps[i][j]=0; + } + for (unsigned j = 0; j < m_mem_config->m_n_mem; ++j) { + pwr_mem_stat->n_rd[i][j]=0; + pwr_mem_stat->n_wr[i][j]=0; + pwr_mem_stat->n_pre[i][j]=0; + } + } +} + + + void power_mem_stat_t::init() { - shmem_read_access[CURRENT_STAT_IDX] = + shmem_access[CURRENT_STAT_IDX] = m_core_stats->gpgpu_n_shmem_bank_access; // Shared memory access - shmem_read_access[PREV_STAT_IDX] = + shmem_access[PREV_STAT_IDX] = (unsigned *)calloc(m_core_config->num_shader(), sizeof(unsigned)); for (unsigned i = 0; i < NUM_STAT_IDX; ++i) { @@ -71,6 +126,7 @@ void power_mem_stat_t::init() { n_pre[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_rd[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_wr[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); + n_wr_WB[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); n_req[i] = (unsigned *)calloc(m_config->m_n_mem, sizeof(unsigned)); // Interconnect stats @@ -86,8 +142,8 @@ void power_mem_stat_t::save_stats() { l2_cache_stats[PREV_STAT_IDX] = l2_cache_stats[CURRENT_STAT_IDX]; for (unsigned i = 0; i < m_core_config->num_shader(); ++i) { - shmem_read_access[PREV_STAT_IDX][i] = - shmem_read_access[CURRENT_STAT_IDX][i]; // Shared memory access + shmem_access[PREV_STAT_IDX][i] = + shmem_access[CURRENT_STAT_IDX][i]; // Shared memory access } for (unsigned i = 0; i < m_config->m_n_mem; ++i) { @@ -98,6 +154,7 @@ void power_mem_stat_t::save_stats() { n_pre[PREV_STAT_IDX][i] = n_pre[CURRENT_STAT_IDX][i]; n_rd[PREV_STAT_IDX][i] = n_rd[CURRENT_STAT_IDX][i]; n_wr[PREV_STAT_IDX][i] = n_wr[CURRENT_STAT_IDX][i]; + n_wr_WB[PREV_STAT_IDX][i] = n_wr_WB[CURRENT_STAT_IDX][i]; n_req[PREV_STAT_IDX][i] = n_req[CURRENT_STAT_IDX][i]; } @@ -117,7 +174,7 @@ void power_mem_stat_t::print(FILE *fout) const { 
unsigned total_mem_writes = 0; for (unsigned i = 0; i < m_config->m_n_mem; ++i) { total_mem_reads += n_rd[CURRENT_STAT_IDX][i]; - total_mem_writes += n_wr[CURRENT_STAT_IDX][i]; + total_mem_writes += n_wr[CURRENT_STAT_IDX][i] + n_wr_WB[CURRENT_STAT_IDX][i]; } fprintf(fout, "Total memory controller accesses: %u\n", total_mem_reads + total_mem_writes); @@ -147,198 +204,165 @@ void power_core_stat_t::print(FILE *fout) { // per core statistics fprintf(fout, "Power Metrics: \n"); for (unsigned i = 0; i < m_config->num_shader(); i++) { - fprintf(fout, "core %u:\n", i); - fprintf(fout, "\tpipeline duty cycle =%f\n", - m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal Deocded Instructions=%u\n", - m_num_decoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FP Deocded Instructions=%u\n", - m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal INT Deocded Instructions=%u\n", - m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal LOAD Queued Instructions=%u\n", - m_num_loadqueued_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal STORE Queued Instructions=%u\n", - m_num_storequeued_insn[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IALU Acesses=%u\n", - m_num_ialu_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FP Acesses=%u\n", - m_num_fp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL Acesses=%u\n", - m_num_imul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL24 Acesses=%u\n", - m_num_imul24_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IMUL32 Acesses=%u\n", - m_num_imul32_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal IDIV Acesses=%u\n", - m_num_idiv_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FPMUL Acesses=%u\n", - m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SFU Acesses=%u\n", - m_num_trans_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal FPDIV Acesses=%u\n", - m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); - 
fprintf(fout, "\tTotal SFU Acesses=%u\n", - m_num_sfu_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SP Acesses=%u\n", - m_num_sp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal MEM Acesses=%u\n", - m_num_mem_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SFU Commissions=%u\n", - m_num_sfu_committed[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal SP Commissions=%u\n", - m_num_sp_committed[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal MEM Commissions=%u\n", - m_num_mem_committed[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal REG Reads=%u\n", - m_read_regfile_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal REG Writes=%u\n", - m_write_regfile_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout, "\tTotal NON REG=%u\n", - m_non_rf_operands[CURRENT_STAT_IDX][i]); + fprintf(fout,"core %u:\n",i); + fprintf(fout,"\tpipeline duty cycle =%f\n",m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal Deocded Instructions=%u\n",m_num_decoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FP Deocded Instructions=%u\n",m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal INT Deocded Instructions=%u\n",m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal LOAD Queued Instructions=%u\n",m_num_loadqueued_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal STORE Queued Instructions=%u\n",m_num_storequeued_insn[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IALU Acesses=%f\n",m_num_ialu_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FP Acesses=%f\n",m_num_fp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal DP Acesses=%f\n",m_num_dp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IMUL Acesses=%f\n",m_num_imul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IMUL24 Acesses=%f\n",m_num_imul24_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IMUL32 Acesses=%f\n",m_num_imul32_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal IDIV 
Acesses=%f\n",m_num_idiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FPMUL Acesses=%f\n",m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal DPMUL Acesses=%f\n",m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SQRT Acesses=%f\n",m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal LOG Acesses=%f\n",m_num_log_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SIN Acesses=%f\n",m_num_sin_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal EXP Acesses=%f\n",m_num_exp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal FPDIV Acesses=%f\n",m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal DPDIV Acesses=%f\n",m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal TENSOR Acesses=%f\n",m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal CONST Acesses=%f\n",m_num_const_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal TEX Acesses=%f\n",m_num_tex_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SFU Acesses=%f\n",m_num_sfu_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SP Acesses=%f\n",m_num_sp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal MEM Acesses=%f\n",m_num_mem_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SFU Commissions=%u\n",m_num_sfu_committed[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal SP Commissions=%u\n",m_num_sp_committed[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal MEM Commissions=%u\n",m_num_mem_committed[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal REG Reads=%u\n",m_read_regfile_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal REG Writes=%u\n",m_write_regfile_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout,"\tTotal NON REG=%u\n",m_non_rf_operands[CURRENT_STAT_IDX][i]); } } void power_core_stat_t::init() { - m_pipeline_duty_cycle[CURRENT_STAT_IDX] = m_core_stats->m_pipeline_duty_cycle; - m_num_decoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_decoded_insn; - 
m_num_FPdecoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_FPdecoded_insn; - m_num_INTdecoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_INTdecoded_insn; - m_num_storequeued_insn[CURRENT_STAT_IDX] = - m_core_stats->m_num_storequeued_insn; - m_num_loadqueued_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_loadqueued_insn; - m_num_ialu_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_ialu_acesses; - m_num_fp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fp_acesses; - m_num_imul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul_acesses; - m_num_imul24_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul24_acesses; - m_num_imul32_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul32_acesses; - m_num_fpmul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fpmul_acesses; - m_num_idiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_idiv_acesses; - m_num_fpdiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fpdiv_acesses; - m_num_sp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sp_acesses; - m_num_sfu_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sfu_acesses; - m_num_trans_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_trans_acesses; - m_num_mem_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_mem_acesses; - m_num_sp_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_sp_committed; - m_num_sfu_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_sfu_committed; - m_num_mem_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_mem_committed; - m_read_regfile_acesses[CURRENT_STAT_IDX] = - m_core_stats->m_read_regfile_acesses; - m_write_regfile_acesses[CURRENT_STAT_IDX] = - m_core_stats->m_write_regfile_acesses; - m_non_rf_operands[CURRENT_STAT_IDX] = m_core_stats->m_non_rf_operands; - m_active_sp_lanes[CURRENT_STAT_IDX] = m_core_stats->m_active_sp_lanes; - m_active_sfu_lanes[CURRENT_STAT_IDX] = m_core_stats->m_active_sfu_lanes; - m_num_tex_inst[CURRENT_STAT_IDX] = m_core_stats->m_num_tex_inst; + m_pipeline_duty_cycle[CURRENT_STAT_IDX]=m_core_stats->m_pipeline_duty_cycle; + 
m_num_decoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_decoded_insn; + m_num_FPdecoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_FPdecoded_insn; + m_num_INTdecoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_INTdecoded_insn; + m_num_storequeued_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_storequeued_insn; + m_num_loadqueued_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_loadqueued_insn; + m_num_ialu_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_ialu_acesses; + m_num_fp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fp_acesses; + m_num_imul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul_acesses; + m_num_imul24_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul24_acesses; + m_num_imul32_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul32_acesses; + m_num_fpmul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fpmul_acesses; + m_num_idiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_idiv_acesses; + m_num_fpdiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fpdiv_acesses; + m_num_dp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dp_acesses; + m_num_dpmul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dpmul_acesses; + m_num_dpdiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dpdiv_acesses; + m_num_sp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sp_acesses; + m_num_sfu_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sfu_acesses; + m_num_sqrt_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sqrt_acesses; + m_num_log_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_log_acesses; + m_num_sin_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sin_acesses; + m_num_exp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_exp_acesses; + m_num_tensor_core_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_tensor_core_acesses; + m_num_const_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_const_acesses; + m_num_tex_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_tex_acesses; + m_num_mem_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_mem_acesses; + 
m_num_sp_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_sp_committed; + m_num_sfu_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_sfu_committed; + m_num_mem_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_mem_committed; + m_read_regfile_acesses[CURRENT_STAT_IDX]=m_core_stats->m_read_regfile_acesses; + m_write_regfile_acesses[CURRENT_STAT_IDX]=m_core_stats->m_write_regfile_acesses; + m_non_rf_operands[CURRENT_STAT_IDX]=m_core_stats->m_non_rf_operands; + m_active_sp_lanes[CURRENT_STAT_IDX]=m_core_stats->m_active_sp_lanes; + m_active_sfu_lanes[CURRENT_STAT_IDX]=m_core_stats->m_active_sfu_lanes; + m_active_exu_threads[CURRENT_STAT_IDX]=m_core_stats->m_active_exu_threads; + m_active_exu_warps[CURRENT_STAT_IDX]=m_core_stats->m_active_exu_warps; + m_num_tex_inst[CURRENT_STAT_IDX]=m_core_stats->m_num_tex_inst; + + m_pipeline_duty_cycle[PREV_STAT_IDX]=(float*)calloc(m_config->num_shader(),sizeof(float)); + m_num_decoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_FPdecoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_INTdecoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_storequeued_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_loadqueued_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_tex_inst[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + + m_num_ialu_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_fp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_imul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_imul24_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_imul32_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + 
m_num_fpmul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_idiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_fpdiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_dp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_dpmul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_dpdiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_tensor_core_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_const_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_tex_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sfu_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sqrt_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_log_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sin_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_exp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_mem_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_num_sp_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_sfu_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_num_mem_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_read_regfile_acesses[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_write_regfile_acesses[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + 
m_non_rf_operands[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_active_sp_lanes[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_active_sfu_lanes[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); + m_active_exu_threads[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_active_exu_warps[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + - m_pipeline_duty_cycle[PREV_STAT_IDX] = - (float *)calloc(m_config->num_shader(), sizeof(float)); - m_num_decoded_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_FPdecoded_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_INTdecoded_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_storequeued_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_loadqueued_insn[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_ialu_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_fp_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_tex_inst[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_imul_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_imul24_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_imul32_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_fpmul_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_idiv_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_fpdiv_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), 
sizeof(unsigned)); - m_num_sp_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sfu_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_trans_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_mem_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sp_committed[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_sfu_committed[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_num_mem_committed[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_read_regfile_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_write_regfile_acesses[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_non_rf_operands[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_active_sp_lanes[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); - m_active_sfu_lanes[PREV_STAT_IDX] = - (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); } void power_core_stat_t::save_stats() { for (unsigned i = 0; i < m_config->num_shader(); ++i) { - m_pipeline_duty_cycle[PREV_STAT_IDX][i] = - m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]; - m_num_decoded_insn[PREV_STAT_IDX][i] = - m_num_decoded_insn[CURRENT_STAT_IDX][i]; - m_num_FPdecoded_insn[PREV_STAT_IDX][i] = - m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]; - m_num_INTdecoded_insn[PREV_STAT_IDX][i] = - m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]; - m_num_storequeued_insn[PREV_STAT_IDX][i] = - m_num_storequeued_insn[CURRENT_STAT_IDX][i]; - m_num_loadqueued_insn[PREV_STAT_IDX][i] = - m_num_loadqueued_insn[CURRENT_STAT_IDX][i]; - m_num_ialu_acesses[PREV_STAT_IDX][i] = - m_num_ialu_acesses[CURRENT_STAT_IDX][i]; - 
m_num_fp_acesses[PREV_STAT_IDX][i] = m_num_fp_acesses[CURRENT_STAT_IDX][i]; - m_num_tex_inst[PREV_STAT_IDX][i] = m_num_tex_inst[CURRENT_STAT_IDX][i]; - m_num_imul_acesses[PREV_STAT_IDX][i] = - m_num_imul_acesses[CURRENT_STAT_IDX][i]; - m_num_imul24_acesses[PREV_STAT_IDX][i] = - m_num_imul24_acesses[CURRENT_STAT_IDX][i]; - m_num_imul32_acesses[PREV_STAT_IDX][i] = - m_num_imul32_acesses[CURRENT_STAT_IDX][i]; - m_num_fpmul_acesses[PREV_STAT_IDX][i] = - m_num_fpmul_acesses[CURRENT_STAT_IDX][i]; - m_num_idiv_acesses[PREV_STAT_IDX][i] = - m_num_idiv_acesses[CURRENT_STAT_IDX][i]; - m_num_fpdiv_acesses[PREV_STAT_IDX][i] = - m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]; - m_num_sp_acesses[PREV_STAT_IDX][i] = m_num_sp_acesses[CURRENT_STAT_IDX][i]; - m_num_sfu_acesses[PREV_STAT_IDX][i] = - m_num_sfu_acesses[CURRENT_STAT_IDX][i]; - m_num_trans_acesses[PREV_STAT_IDX][i] = - m_num_trans_acesses[CURRENT_STAT_IDX][i]; - m_num_mem_acesses[PREV_STAT_IDX][i] = - m_num_mem_acesses[CURRENT_STAT_IDX][i]; - m_num_sp_committed[PREV_STAT_IDX][i] = - m_num_sp_committed[CURRENT_STAT_IDX][i]; - m_num_sfu_committed[PREV_STAT_IDX][i] = - m_num_sfu_committed[CURRENT_STAT_IDX][i]; - m_num_mem_committed[PREV_STAT_IDX][i] = - m_num_mem_committed[CURRENT_STAT_IDX][i]; - m_read_regfile_acesses[PREV_STAT_IDX][i] = - m_read_regfile_acesses[CURRENT_STAT_IDX][i]; - m_write_regfile_acesses[PREV_STAT_IDX][i] = - m_write_regfile_acesses[CURRENT_STAT_IDX][i]; - m_non_rf_operands[PREV_STAT_IDX][i] = - m_non_rf_operands[CURRENT_STAT_IDX][i]; - m_active_sp_lanes[PREV_STAT_IDX][i] = - m_active_sp_lanes[CURRENT_STAT_IDX][i]; - m_active_sfu_lanes[PREV_STAT_IDX][i] = - m_active_sfu_lanes[CURRENT_STAT_IDX][i]; + m_pipeline_duty_cycle[PREV_STAT_IDX][i]=m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]; + m_num_decoded_insn[PREV_STAT_IDX][i]= m_num_decoded_insn[CURRENT_STAT_IDX][i]; + m_num_FPdecoded_insn[PREV_STAT_IDX][i]=m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]; + 
m_num_INTdecoded_insn[PREV_STAT_IDX][i]=m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]; + m_num_storequeued_insn[PREV_STAT_IDX][i]=m_num_storequeued_insn[CURRENT_STAT_IDX][i]; + m_num_loadqueued_insn[PREV_STAT_IDX][i]=m_num_loadqueued_insn[CURRENT_STAT_IDX][i]; + m_num_ialu_acesses[PREV_STAT_IDX][i]=m_num_ialu_acesses[CURRENT_STAT_IDX][i]; + m_num_fp_acesses[PREV_STAT_IDX][i]=m_num_fp_acesses[CURRENT_STAT_IDX][i]; + m_num_tex_inst[PREV_STAT_IDX][i]=m_num_tex_inst[CURRENT_STAT_IDX][i]; + m_num_imul_acesses[PREV_STAT_IDX][i]=m_num_imul_acesses[CURRENT_STAT_IDX][i]; + m_num_imul24_acesses[PREV_STAT_IDX][i]=m_num_imul24_acesses[CURRENT_STAT_IDX][i]; + m_num_imul32_acesses[PREV_STAT_IDX][i]=m_num_imul32_acesses[CURRENT_STAT_IDX][i]; + m_num_fpmul_acesses[PREV_STAT_IDX][i]=m_num_fpmul_acesses[CURRENT_STAT_IDX][i]; + m_num_idiv_acesses[PREV_STAT_IDX][i]=m_num_idiv_acesses[CURRENT_STAT_IDX][i]; + m_num_fpdiv_acesses[PREV_STAT_IDX][i]=m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]; + m_num_sp_acesses[PREV_STAT_IDX][i]=m_num_sp_acesses[CURRENT_STAT_IDX][i]; + m_num_sfu_acesses[PREV_STAT_IDX][i]=m_num_sfu_acesses[CURRENT_STAT_IDX][i]; + m_num_sqrt_acesses[PREV_STAT_IDX][i]=m_num_sqrt_acesses[CURRENT_STAT_IDX][i]; + m_num_log_acesses[PREV_STAT_IDX][i]=m_num_log_acesses[CURRENT_STAT_IDX][i]; + m_num_sin_acesses[PREV_STAT_IDX][i]=m_num_sin_acesses[CURRENT_STAT_IDX][i]; + m_num_exp_acesses[PREV_STAT_IDX][i]=m_num_exp_acesses[CURRENT_STAT_IDX][i]; + m_num_dp_acesses[PREV_STAT_IDX][i]=m_num_dp_acesses[CURRENT_STAT_IDX][i]; + m_num_dpmul_acesses[PREV_STAT_IDX][i]=m_num_dpmul_acesses[CURRENT_STAT_IDX][i]; + m_num_dpdiv_acesses[PREV_STAT_IDX][i]=m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]; + m_num_tensor_core_acesses[PREV_STAT_IDX][i]=m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]; + m_num_const_acesses[PREV_STAT_IDX][i]=m_num_const_acesses[CURRENT_STAT_IDX][i]; + m_num_tex_acesses[PREV_STAT_IDX][i]=m_num_tex_acesses[CURRENT_STAT_IDX][i]; + 
m_num_mem_acesses[PREV_STAT_IDX][i]=m_num_mem_acesses[CURRENT_STAT_IDX][i]; + m_num_sp_committed[PREV_STAT_IDX][i]=m_num_sp_committed[CURRENT_STAT_IDX][i]; + m_num_sfu_committed[PREV_STAT_IDX][i]=m_num_sfu_committed[CURRENT_STAT_IDX][i]; + m_num_mem_committed[PREV_STAT_IDX][i]=m_num_mem_committed[CURRENT_STAT_IDX][i]; + m_read_regfile_acesses[PREV_STAT_IDX][i]=m_read_regfile_acesses[CURRENT_STAT_IDX][i]; + m_write_regfile_acesses[PREV_STAT_IDX][i]=m_write_regfile_acesses[CURRENT_STAT_IDX][i]; + m_non_rf_operands[PREV_STAT_IDX][i]=m_non_rf_operands[CURRENT_STAT_IDX][i]; + m_active_sp_lanes[PREV_STAT_IDX][i]=m_active_sp_lanes[CURRENT_STAT_IDX][i]; + m_active_sfu_lanes[PREV_STAT_IDX][i]=m_active_sfu_lanes[CURRENT_STAT_IDX][i]; + m_active_exu_threads[PREV_STAT_IDX][i]=m_active_exu_threads[CURRENT_STAT_IDX][i]; + m_active_exu_warps[PREV_STAT_IDX][i]=m_active_exu_warps[CURRENT_STAT_IDX][i]; } } @@ -356,6 +380,51 @@ power_stat_t::power_stat_t(const shader_core_config *shader_config, m_active_sms = active_sms; m_config = shader_config; m_mem_config = mem_config; + l1r_hits_kernel = 0; + l1r_misses_kernel = 0; + l1w_hits_kernel = 0; + l1w_misses_kernel = 0; + shared_accesses_kernel = 0; + cc_accesses_kernel = 0; + dram_rd_kernel = 0; + dram_wr_kernel = 0; + dram_pre_kernel = 0; + l1i_hits_kernel =0; + l1i_misses_kernel =0; + l2r_hits_kernel =0; + l2r_misses_kernel =0; + l2w_hits_kernel =0; + l2w_misses_kernel =0; + noc_tr_kernel = 0; + noc_rc_kernel = 0; + + tot_inst_execution = 0; + tot_int_inst_execution = 0; + tot_fp_inst_execution = 0; + commited_inst_execution = 0; + ialu_acc_execution = 0; + imul24_acc_execution = 0; + imul32_acc_execution = 0; + imul_acc_execution = 0; + idiv_acc_execution = 0; + dp_acc_execution = 0; + dpmul_acc_execution = 0; + dpdiv_acc_execution = 0; + fp_acc_execution = 0; + fpmul_acc_execution = 0; + fpdiv_acc_execution = 0; + sqrt_acc_execution = 0; + log_acc_execution = 0; + sin_acc_execution = 0; + exp_acc_execution = 0; + 
tensor_acc_execution = 0; + tex_acc_execution = 0; + tot_fpu_acc_execution = 0; + tot_sfu_acc_execution = 0; + tot_threads_acc_execution = 0; + tot_warps_acc_execution = 0; + sp_active_lanes_execution = 0; + sfu_active_lanes_execution = 0; } void power_stat_t::visualizer_print(gzFile visualizer_file) { diff --git a/src/gpgpu-sim/power_stat.h b/src/gpgpu-sim/power_stat.h index c469db3b3..e2c3ed5cc 100644 --- a/src/gpgpu-sim/power_stat.h +++ b/src/gpgpu-sim/power_stat.h @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. 
Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -51,29 +52,40 @@ struct shader_core_power_stats_pod { unsigned *m_num_INTdecoded_insn[NUM_STAT_IDX]; // number of instructions committed // by this shader core - unsigned *m_num_storequeued_insn[NUM_STAT_IDX]; - unsigned *m_num_loadqueued_insn[NUM_STAT_IDX]; - unsigned *m_num_ialu_acesses[NUM_STAT_IDX]; - unsigned *m_num_fp_acesses[NUM_STAT_IDX]; - unsigned *m_num_tex_inst[NUM_STAT_IDX]; - unsigned *m_num_imul_acesses[NUM_STAT_IDX]; - unsigned *m_num_imul32_acesses[NUM_STAT_IDX]; - unsigned *m_num_imul24_acesses[NUM_STAT_IDX]; - unsigned *m_num_fpmul_acesses[NUM_STAT_IDX]; - unsigned *m_num_idiv_acesses[NUM_STAT_IDX]; - unsigned *m_num_fpdiv_acesses[NUM_STAT_IDX]; - unsigned *m_num_sp_acesses[NUM_STAT_IDX]; - unsigned *m_num_sfu_acesses[NUM_STAT_IDX]; - unsigned *m_num_trans_acesses[NUM_STAT_IDX]; - unsigned *m_num_mem_acesses[NUM_STAT_IDX]; - unsigned *m_num_sp_committed[NUM_STAT_IDX]; - unsigned *m_num_sfu_committed[NUM_STAT_IDX]; - unsigned *m_num_mem_committed[NUM_STAT_IDX]; - unsigned *m_active_sp_lanes[NUM_STAT_IDX]; - unsigned *m_active_sfu_lanes[NUM_STAT_IDX]; - unsigned *m_read_regfile_acesses[NUM_STAT_IDX]; - unsigned *m_write_regfile_acesses[NUM_STAT_IDX]; - unsigned *m_non_rf_operands[NUM_STAT_IDX]; + unsigned *m_num_storequeued_insn[NUM_STAT_IDX]; + unsigned *m_num_loadqueued_insn[NUM_STAT_IDX]; + unsigned *m_num_tex_inst[NUM_STAT_IDX]; + double *m_num_ialu_acesses[NUM_STAT_IDX]; + double *m_num_fp_acesses[NUM_STAT_IDX]; + double *m_num_imul_acesses[NUM_STAT_IDX]; + double *m_num_imul32_acesses[NUM_STAT_IDX]; + double *m_num_imul24_acesses[NUM_STAT_IDX]; + 
double *m_num_fpmul_acesses[NUM_STAT_IDX]; + double *m_num_idiv_acesses[NUM_STAT_IDX]; + double *m_num_fpdiv_acesses[NUM_STAT_IDX]; + double *m_num_dp_acesses[NUM_STAT_IDX]; + double *m_num_dpmul_acesses[NUM_STAT_IDX]; + double *m_num_dpdiv_acesses[NUM_STAT_IDX]; + double *m_num_sp_acesses[NUM_STAT_IDX]; + double *m_num_sfu_acesses[NUM_STAT_IDX]; + double *m_num_sqrt_acesses[NUM_STAT_IDX]; + double *m_num_log_acesses[NUM_STAT_IDX]; + double *m_num_sin_acesses[NUM_STAT_IDX]; + double *m_num_exp_acesses[NUM_STAT_IDX]; + double *m_num_tensor_core_acesses[NUM_STAT_IDX]; + double *m_num_const_acesses[NUM_STAT_IDX]; + double *m_num_tex_acesses[NUM_STAT_IDX]; + double *m_num_mem_acesses[NUM_STAT_IDX]; + unsigned *m_num_sp_committed[NUM_STAT_IDX]; + unsigned *m_num_sfu_committed[NUM_STAT_IDX]; + unsigned *m_num_mem_committed[NUM_STAT_IDX]; + unsigned *m_active_sp_lanes[NUM_STAT_IDX]; + unsigned *m_active_sfu_lanes[NUM_STAT_IDX]; + double *m_active_exu_threads[NUM_STAT_IDX]; + double *m_active_exu_warps[NUM_STAT_IDX]; + unsigned *m_read_regfile_acesses[NUM_STAT_IDX]; + unsigned *m_write_regfile_acesses[NUM_STAT_IDX]; + unsigned *m_non_rf_operands[NUM_STAT_IDX]; }; class power_core_stat_t : public shader_core_power_stats_pod { @@ -84,6 +96,7 @@ class power_core_stat_t : public shader_core_power_stats_pod { void print(FILE *fout); void init(); void save_stats(); + private: shader_core_stats *m_core_stats; @@ -96,8 +109,7 @@ struct mem_power_stats_pod { class cache_stats core_cache_stats[NUM_STAT_IDX]; // Total core stats class cache_stats l2_cache_stats[NUM_STAT_IDX]; // Total L2 partition stats - unsigned *shmem_read_access[NUM_STAT_IDX]; // Shared memory access - + unsigned *shmem_access[NUM_STAT_IDX]; // Shared memory access // Low level DRAM stats unsigned *n_cmd[NUM_STAT_IDX]; unsigned *n_activity[NUM_STAT_IDX]; @@ -106,6 +118,7 @@ struct mem_power_stats_pod { unsigned *n_pre[NUM_STAT_IDX]; unsigned *n_rd[NUM_STAT_IDX]; unsigned *n_wr[NUM_STAT_IDX]; + unsigned 
*n_wr_WB[NUM_STAT_IDX]; unsigned *n_req[NUM_STAT_IDX]; // Interconnect stats @@ -144,34 +157,88 @@ class power_stat_t { *m_average_pipeline_duty_cycle = 0; *m_active_sms = 0; } - - unsigned get_total_inst() { - unsigned total_inst = 0; + void clear(); + unsigned l1i_misses_kernel; + unsigned l1i_hits_kernel; + unsigned long long l1r_hits_kernel; + unsigned long long l1r_misses_kernel; + unsigned long long l1w_hits_kernel; + unsigned long long l1w_misses_kernel; + unsigned long long shared_accesses_kernel; + unsigned long long cc_accesses_kernel; + unsigned long long dram_rd_kernel; + unsigned long long dram_wr_kernel; + unsigned long long dram_pre_kernel; + unsigned long long l2r_hits_kernel; + unsigned long long l2r_misses_kernel; + unsigned long long l2w_hits_kernel; + unsigned long long l2w_misses_kernel; + unsigned long long noc_tr_kernel; + unsigned long long noc_rc_kernel; + unsigned long long tot_inst_execution; + unsigned long long tot_int_inst_execution; + unsigned long long tot_fp_inst_execution; + unsigned long long commited_inst_execution; + unsigned long long ialu_acc_execution; + unsigned long long imul24_acc_execution; + unsigned long long imul32_acc_execution; + unsigned long long imul_acc_execution; + unsigned long long idiv_acc_execution; + unsigned long long dp_acc_execution; + unsigned long long dpmul_acc_execution; + unsigned long long dpdiv_acc_execution; + unsigned long long fp_acc_execution; + unsigned long long fpmul_acc_execution; + unsigned long long fpdiv_acc_execution; + unsigned long long sqrt_acc_execution; + unsigned long long log_acc_execution; + unsigned long long sin_acc_execution; + unsigned long long exp_acc_execution; + unsigned long long tensor_acc_execution; + unsigned long long tex_acc_execution; + unsigned long long tot_fpu_acc_execution; + unsigned long long tot_sfu_acc_execution; + unsigned long long tot_threads_acc_execution; + unsigned long long tot_warps_acc_execution; + unsigned long long sp_active_lanes_execution; + 
unsigned long long sfu_active_lanes_execution; + double get_total_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_decoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_int_inst() { - unsigned total_inst = 0; + double get_total_int_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += + if(aggregate_stat) + total_inst += + (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_INTdecoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_fp_inst() { - unsigned total_inst = 0; + double get_total_fp_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_FPdecoded_insn[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_total_load_inst() { - unsigned total_inst = 0; + double get_total_load_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_loadqueued_insn[CURRENT_STAT_IDX][i]) - @@ -179,8 +246,8 @@ class power_stat_t { } return total_inst; } - unsigned get_total_store_inst() { - unsigned total_inst = 0; + double get_total_store_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); 
i++) { total_inst += (pwr_core_stat->m_num_storequeued_insn[CURRENT_STAT_IDX][i]) - @@ -188,34 +255,39 @@ class power_stat_t { } return total_inst; } - unsigned get_sp_committed_inst() { - unsigned total_inst = 0; + double get_sp_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sp_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_sfu_committed_inst() { - unsigned total_inst = 0; + double get_sfu_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_mem_committed_inst() { - unsigned total_inst = 0; + double get_mem_committed_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_committed_inst() { - unsigned total_inst = 0; + double get_committed_inst(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]) + (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]) + @@ -224,19 +296,27 @@ class power_stat_t { } return total_inst; } - unsigned get_regfile_reads() { - 
unsigned total_inst = 0; + double get_regfile_reads(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += + if(aggregate_stat) + total_inst += + (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_read_regfile_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_regfile_writes() { - unsigned total_inst = 0; + double get_regfile_writes(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += + if(aggregate_stat) + total_inst += + (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_write_regfile_acesses[PREV_STAT_IDX][i]); } @@ -253,17 +333,20 @@ class power_stat_t { return total_inst; } - unsigned get_non_regfile_operands() { - unsigned total_inst = 0; + double get_non_regfile_operands(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_non_rf_operands[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_sp_accessess() { - unsigned total_inst = 0; + double get_sp_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sp_acesses[PREV_STAT_IDX][i]); @@ -271,25 +354,58 @@ class power_stat_t { return total_inst; } - unsigned get_sfu_accessess() { - unsigned total_inst = 0; + double get_sfu_accessess() { + double total_inst = 0; for (unsigned i = 0; i < 
m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_sfu_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sfu_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_trans_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_trans_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_trans_acesses[PREV_STAT_IDX][i]); - } - return total_inst; + + double get_sqrt_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst+=(pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_log_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst+=(pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_sin_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst+=(pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_exp_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst+=(pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+=(pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]); + } + return total_inst; } - unsigned get_mem_accessess() { - unsigned total_inst = 0; + double 
get_mem_accessess() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_mem_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_mem_acesses[PREV_STAT_IDX][i]); @@ -297,66 +413,164 @@ class power_stat_t { return total_inst; } - unsigned get_intdiv_accessess() { - unsigned total_inst = 0; + double get_intdiv_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_fpdiv_accessess() { - unsigned total_inst = 0; + double get_fpdiv_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul32_accessess() { - unsigned total_inst = 0; + double get_intmul32_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul24_accessess() { - unsigned total_inst = 0; + double get_intmul24_accessess(bool 
aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_intmul_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); + double get_intmul_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst+= (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst+= (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_fpmul_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_fpmul_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]); + double get_fp_accessess(bool aggregate_stat){ + double 
total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]); } return total_inst; } - float get_sp_active_lanes() { - unsigned total_inst = 0; + double get_dp_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_dpmul_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_dpdiv_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_tensor_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_const_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; 
inum_shader();i++){ + if(aggregate_stat) + total_inst += pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]; + else + total_inst += (pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_const_acesses[PREV_STAT_IDX][i]); + } + return (total_inst); + } + + double get_tex_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + + double get_sp_active_lanes() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_active_sp_lanes[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_active_sp_lanes[PREV_STAT_IDX][i]); @@ -365,7 +579,7 @@ class power_stat_t { } float get_sfu_active_lanes() { - unsigned total_inst = 0; + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_active_sfu_lanes[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_active_sfu_lanes[PREV_STAT_IDX][i]); @@ -375,49 +589,141 @@ class power_stat_t { m_config->gpgpu_num_sfu_units; } - unsigned get_tot_fpu_accessess() { - unsigned total_inst = 0; + + float get_active_threads(bool aggregate_stat) { + unsigned total_threads = 0; + unsigned total_warps = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if(aggregate_stat){ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) ; + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]); + } + else{ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); + } + } + 
if(total_warps != 0) + return (float)((float)total_threads / (float)total_warps); + else + return 0; + } + + unsigned long long get_tot_threads_kernel(bool aggregate_stat) { + unsigned total_threads = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat){ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) ; + } + else{ + total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); + } + } + + return total_threads; + } + unsigned long long get_tot_warps_kernel(bool aggregate_stat) { + unsigned long long total_warps = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if(aggregate_stat){ + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]); + } + else{ + total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); + } + } + return total_warps; + } + + + double get_tot_fpu_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]); + 
(pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); } - total_inst += - get_total_load_inst() + get_total_store_inst() + get_tex_inst(); + //total_inst += get_total_load_inst()+get_total_store_inst()+get_tex_inst(); return total_inst; } - unsigned get_tot_sfu_accessess() { - unsigned total_inst = 0; - for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - + + + double get_tot_sfu_accessess(bool aggregate_stat){ + double total_inst=0; + for(unsigned i=0; inum_shader();i++){ + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i])+ + (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_trans_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_trans_acesses[PREV_STAT_IDX][i]); + 
(pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); + } return total_inst; } - unsigned get_ialu_accessess() { - unsigned total_inst = 0; + double get_ialu_accessess(bool aggregate_stat) { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]) - + if(aggregate_stat) + total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_ialu_acesses[PREV_STAT_IDX][i]); } return total_inst; } - 
unsigned get_tex_inst() { - unsigned total_inst = 0; + double get_tex_inst() { + double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { total_inst += (pwr_core_stat->m_num_tex_inst[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_tex_inst[PREV_STAT_IDX][i]); @@ -425,7 +731,7 @@ class power_stat_t { return total_inst; } - unsigned get_constant_c_accesses() { + double get_constant_c_accesses() { enum mem_access_type access_type[] = {CONST_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = @@ -440,7 +746,7 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_constant_c_misses() { + double get_constant_c_misses() { enum mem_access_type access_type[] = {CONST_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = @@ -455,10 +761,10 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_constant_c_hits() { + double get_constant_c_hits() { return (get_constant_c_accesses() - get_constant_c_misses()); } - unsigned get_texture_c_accesses() { + double get_texture_c_accesses() { enum mem_access_type access_type[] = {TEXTURE_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = @@ -473,7 +779,7 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_texture_c_misses() { + double get_texture_c_misses() { enum mem_access_type access_type[] = {TEXTURE_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = @@ -488,205 +794,268 @@ class power_stat_t { access_type, num_access_type, request_status, num_request_status)); } - unsigned get_texture_c_hits() { + double get_texture_c_hits() { return (get_texture_c_accesses() - get_texture_c_misses()); } - unsigned get_inst_c_accesses() { + double get_inst_c_accesses(bool aggregate_stat) 
{ enum mem_access_type access_type[] = {INST_ACC_R}; enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat) + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + else + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); } - unsigned get_inst_c_misses() { + double get_inst_c_misses(bool aggregate_stat) { enum mem_access_type access_type[] = {INST_ACC_R}; enum cache_request_status request_status[] = {MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat) + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + else + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); } - unsigned get_inst_c_hits() { - return (get_inst_c_accesses() - get_inst_c_misses()); + double get_inst_c_hits(bool aggregate_stat) { + return (get_inst_c_accesses(aggregate_stat) - get_inst_c_misses(aggregate_stat)); } - unsigned get_l1d_read_accesses() { + double get_l1d_read_accesses(bool aggregate_stat) { enum 
mem_access_type access_type[] = {GLOBAL_ACC_R, LOCAL_ACC_R}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } + } + double get_l1d_read_misses(bool aggregate_stat) { + return (get_l1d_read_accesses(aggregate_stat) - get_l1d_read_hits(aggregate_stat)); } - unsigned get_l1d_read_misses() { + double get_l1d_read_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_R, LOCAL_ACC_R}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, MSHR_HIT}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - 
unsigned get_l1d_read_hits() { - return (get_l1d_read_accesses() - get_l1d_read_misses()); - } - unsigned get_l1d_write_accesses() { + double get_l1d_write_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } + } + double get_l1d_write_misses(bool aggregate_stat) { + return (get_l1d_write_accesses(aggregate_stat) - get_l1d_write_hits(aggregate_stat)); } - unsigned get_l1d_write_misses() { + double get_l1d_write_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, MSHR_HIT}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( 
access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l1d_write_hits() { - return (get_l1d_write_accesses() - get_l1d_write_misses()); - } - unsigned get_cache_misses() { - return get_l1d_read_misses() + get_constant_c_misses() + - get_l1d_write_misses() + get_texture_c_misses(); + double get_cache_misses() { + return get_l1d_read_misses(0) + get_constant_c_misses() + + get_l1d_write_misses(0) + get_texture_c_misses(); } - unsigned get_cache_read_misses() { - return get_l1d_read_misses() + get_constant_c_misses() + + double get_cache_read_misses() { + return get_l1d_read_misses(0) + get_constant_c_misses() + get_texture_c_misses(); } - unsigned get_cache_write_misses() { return get_l1d_write_misses(); } + double get_cache_write_misses() { return get_l1d_write_misses(0); } - unsigned get_shmem_read_access() { + double get_shmem_access(bool aggregate_stat) { unsigned total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - total_inst += (pwr_mem_stat->shmem_read_access[CURRENT_STAT_IDX][i]) - - (pwr_mem_stat->shmem_read_access[PREV_STAT_IDX][i]); + if(aggregate_stat) + total_inst += (pwr_mem_stat->shmem_access[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_mem_stat->shmem_access[CURRENT_STAT_IDX][i]) - + (pwr_mem_stat->shmem_access[PREV_STAT_IDX][i]); } return total_inst; } - unsigned get_l2_read_accesses() { + unsigned long long get_l2_read_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = { GLOBAL_ACC_R, LOCAL_ACC_R, CONST_ACC_R, TEXTURE_ACC_R, INST_ACC_R}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum 
cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_read_misses() { - enum mem_access_type access_type[] = { + unsigned long long get_l2_read_misses(bool aggregate_stat) { + return (get_l2_read_accesses(aggregate_stat) - get_l2_read_hits(aggregate_stat)); + } + + unsigned long long get_l2_read_hits(bool aggregate_stat) { + enum mem_access_type access_type[] = { GLOBAL_ACC_R, LOCAL_ACC_R, CONST_ACC_R, TEXTURE_ACC_R, INST_ACC_R}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_read_hits() { - return (get_l2_read_accesses() - get_l2_read_misses()); - } - - unsigned get_l2_write_accesses() { + unsigned long long get_l2_write_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, 
L1_WRBK_ACC}; - enum cache_request_status request_status[] = {HIT, MISS, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_write_misses() { - enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, + unsigned long long get_l2_write_misses(bool aggregate_stat) { + return (get_l2_write_accesses(aggregate_stat) - get_l2_write_hits(aggregate_stat)); + } + unsigned long long get_l2_write_hits(bool aggregate_stat) { + enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, L1_WRBK_ACC}; - enum cache_request_status request_status[] = {MISS}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + if(aggregate_stat){ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } + else{ + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( access_type, num_access_type, request_status, num_request_status)) - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( access_type, 
num_access_type, request_status, num_request_status)); + } } - unsigned get_l2_write_hits() { - return (get_l2_write_accesses() - get_l2_write_misses()); - } - unsigned get_dram_cmd() { + double get_dram_cmd() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_cmd[CURRENT_STAT_IDX][i] - @@ -694,7 +1063,7 @@ class power_stat_t { } return total; } - unsigned get_dram_activity() { + double get_dram_activity() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_activity[CURRENT_STAT_IDX][i] - @@ -702,7 +1071,7 @@ class power_stat_t { } return total; } - unsigned get_dram_nop() { + double get_dram_nop() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_nop[CURRENT_STAT_IDX][i] - @@ -710,7 +1079,7 @@ class power_stat_t { } return total; } - unsigned get_dram_act() { + double get_dram_act() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_act[CURRENT_STAT_IDX][i] - @@ -718,31 +1087,49 @@ class power_stat_t { } return total; } - unsigned get_dram_pre() { + double get_dram_pre(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i] - + if(aggregate_stat){ + total += pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_pre[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_rd() { + double get_dram_rd(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i] - + if(aggregate_stat){ + total += pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_rd[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_wr() { + double 
get_dram_wr(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - total += (pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_wr[PREV_STAT_IDX][i]); + if(aggregate_stat){ + total += pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] + + pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_wr[PREV_STAT_IDX][i]) + + (pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_wr_WB[PREV_STAT_IDX][i]); + } } return total; } - unsigned get_dram_req() { + double get_dram_req() { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { total += (pwr_mem_stat->n_req[CURRENT_STAT_IDX][i] - @@ -751,20 +1138,31 @@ class power_stat_t { return total; } - long get_icnt_simt_to_mem() { + unsigned long long get_icnt_simt_to_mem(bool aggregate_stat) { long total = 0; - for (unsigned i = 0; i < m_config->n_simt_clusters; ++i) { - total += (pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i] - + for (unsigned i = 0; i < m_config->n_simt_clusters; ++i){ + if(aggregate_stat){ + total += pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i]; + } + else{ + total += (pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_simt_to_mem[PREV_STAT_IDX][i]); + } } return total; } - long get_icnt_mem_to_simt() { + unsigned long long get_icnt_mem_to_simt(bool aggregate_stat) { long total = 0; for (unsigned i = 0; i < m_config->n_simt_clusters; ++i) { - total += (pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i] - + if(aggregate_stat){ + total += pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]; + } + + else{ + total += (pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i] - pwr_mem_stat->n_mem_to_simt[PREV_STAT_IDX][i]); + } } return total; } diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index bcfda1867..c0161dd31 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. 
Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// George L. Yuan, Andrew Turner, Inderpreet Singh -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, +// George L. Yuan, Andrew Turner, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -485,6 +486,10 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu, m_sid = shader_id; m_tpc = tpc_id; + if(get_gpu()->get_config().g_power_simulation_enabled){ + scaling_coeffs = get_gpu()->get_scaling_coeffs(); + } + m_last_inst_gpu_sim_cycle = 0; m_last_inst_gpu_tot_sim_cycle = 0; @@ -888,7 +893,7 @@ void shader_core_ctx::decode() { m_warp[m_inst_fetch_buffer.m_warp_id]->inc_inst_in_pipeline(); if (pI1) { m_stats->m_num_decoded_insn[m_sid]++; - if (pI1->oprnd_type == INT_OP) { + if ((pI1->oprnd_type == INT_OP) || (pI1->oprnd_type == UN_OP)) { //these counters get added up in mcPat to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; } else if (pI1->oprnd_type == FP_OP) { m_stats->m_num_FPdecoded_insn[m_sid]++; @@ -899,7 +904,7 @@ void shader_core_ctx::decode() { m_warp[m_inst_fetch_buffer.m_warp_id]->ibuffer_fill(1, pI2); m_warp[m_inst_fetch_buffer.m_warp_id]->inc_inst_in_pipeline(); m_stats->m_num_decoded_insn[m_sid]++; - if (pI2->oprnd_type == INT_OP) { + if ((pI1->oprnd_type == INT_OP) || (pI1->oprnd_type == UN_OP)) { //these counters get added up in mcPat to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; } else if (pI2->oprnd_type == FP_OP) { m_stats->m_num_FPdecoded_insn[m_sid]++; @@ -982,8 +987,10 @@ void shader_core_ctx::fetch() { m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); std::list events; enum cache_request_status status; - if (m_config->perfect_inst_const_cache) + if (m_config->perfect_inst_const_cache){ status = HIT; + shader_cache_access_log(m_sid, INSTRUCTION, 0); + } else status = m_L1I->access( (new_addr_type)ppc, mf, @@ -2275,7 +2282,7 @@ void sp_unit::active_lanes_in_pipeline() { void dp_unit::active_lanes_in_pipeline() { unsigned active_count = pipelined_simd_unit::get_active_lanes_in_pipeline(); assert(active_count <= 
m_core->get_config()->warp_size); - m_core->incspactivelanes_stat(active_count); + //m_core->incspactivelanes_stat(active_count); m_core->incfuactivelanes_stat(active_count); m_core->incfumemactivelanes_stat(active_count); } @@ -3079,52 +3086,69 @@ void warp_inst_t::print(FILE *fout) const { m_config->gpgpu_ctx->func_sim->ptx_print_insn(pc, fout); fprintf(fout, "\n"); } -void shader_core_ctx::incexecstat(warp_inst_t *&inst) { - if (inst->mem_op == TEX) inctex_stat(inst->active_count(), 1); - - // Latency numbers for next operations are used to scale the power values - // for special operations, according observations from microbenchmarking - // TODO: put these numbers in the xml configuration - - switch (inst->sp_op) { +void shader_core_ctx::incexecstat(warp_inst_t *&inst) +{ + // Latency numbers for next operations are used to scale the power values + // for special operations, according observations from microbenchmarking + // TODO: put these numbers in the xml configuration + if(get_gpu()->get_config().g_power_simulation_enabled){ + switch(inst->sp_op){ case INT__OP: - incialu_stat(inst->active_count(), 32); + incialu_stat(inst->active_count(), scaling_coeffs->int_coeff); break; case INT_MUL_OP: - incimul_stat(inst->active_count(), 7.2); + incimul_stat(inst->active_count(), scaling_coeffs->int_mul_coeff); break; case INT_MUL24_OP: - incimul24_stat(inst->active_count(), 4.2); + incimul24_stat(inst->active_count(), scaling_coeffs->int_mul24_coeff); break; case INT_MUL32_OP: - incimul32_stat(inst->active_count(), 4); + incimul32_stat(inst->active_count(), scaling_coeffs->int_mul32_coeff); break; case INT_DIV_OP: - incidiv_stat(inst->active_count(), 40); + incidiv_stat(inst->active_count(), scaling_coeffs->int_div_coeff); break; case FP__OP: - incfpalu_stat(inst->active_count(), 1); + incfpalu_stat(inst->active_count(),scaling_coeffs->fp_coeff); break; case FP_MUL_OP: - incfpmul_stat(inst->active_count(), 1.8); + incfpmul_stat(inst->active_count(), 
scaling_coeffs->fp_mul_coeff); break; case FP_DIV_OP: - incfpdiv_stat(inst->active_count(), 48); + incfpdiv_stat(inst->active_count(), scaling_coeffs->fp_div_coeff); + break; + case DP___OP: + incdpalu_stat(inst->active_count(), scaling_coeffs->dp_coeff); + break; + case DP_MUL_OP: + incdpmul_stat(inst->active_count(), scaling_coeffs->dp_mul_coeff); + break; + case DP_DIV_OP: + incdpdiv_stat(inst->active_count(), scaling_coeffs->dp_div_coeff); break; case FP_SQRT_OP: - inctrans_stat(inst->active_count(), 25); + incsqrt_stat(inst->active_count(), scaling_coeffs->sqrt_coeff); break; case FP_LG_OP: - inctrans_stat(inst->active_count(), 35); + inclog_stat(inst->active_count(), scaling_coeffs->log_coeff); break; case FP_SIN_OP: - inctrans_stat(inst->active_count(), 12); + incsin_stat(inst->active_count(), scaling_coeffs->sin_coeff); break; case FP_EXP_OP: - inctrans_stat(inst->active_count(), 35); + incexp_stat(inst->active_count(), scaling_coeffs->exp_coeff); + break; + case TENSOR__OP: + inctensor_stat(inst->active_count(), scaling_coeffs->tensor_coeff); + break; + case TEX__OP: + inctex_stat(inst->active_count(), scaling_coeffs->tex_coeff); break; default: break; + } + if(inst->const_cache_operand) //warp has const address space load as one operand + inc_const_accesses(1); } } void shader_core_ctx::print_stage(unsigned int stage, FILE *fout) const { diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index f2fac1209..65d56251c 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1,19 +1,20 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Wilson W.L. Fung, Andrew Turner, -// Ali Bakhoda -// The University of British Columbia +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Andrew Turner, +// Ali Bakhoda, Vijay Kandiah, Nikos Hardavellas +// The University of British Columbia, Northwestern University // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer; +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution; +// 3. Neither the names of The University of British Columbia, Northwestern +// University nor the names of their contributors may be used to +// endorse or promote products derived from this software without specific +// prior written permission. 
// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE @@ -1709,18 +1710,26 @@ struct shader_core_stats_pod { unsigned *m_num_INTdecoded_insn; unsigned *m_num_storequeued_insn; unsigned *m_num_loadqueued_insn; - unsigned *m_num_ialu_acesses; - unsigned *m_num_fp_acesses; - unsigned *m_num_imul_acesses; unsigned *m_num_tex_inst; - unsigned *m_num_fpmul_acesses; - unsigned *m_num_idiv_acesses; - unsigned *m_num_fpdiv_acesses; - unsigned *m_num_sp_acesses; - unsigned *m_num_sfu_acesses; - unsigned *m_num_tensor_core_acesses; - unsigned *m_num_trans_acesses; - unsigned *m_num_mem_acesses; + double *m_num_ialu_acesses; + double *m_num_fp_acesses; + double *m_num_imul_acesses; + double *m_num_fpmul_acesses; + double *m_num_idiv_acesses; + double *m_num_fpdiv_acesses; + double *m_num_sp_acesses; + double *m_num_sfu_acesses; + double *m_num_tensor_core_acesses; + double *m_num_tex_acesses; + double *m_num_const_acesses; + double *m_num_dp_acesses; + double *m_num_dpmul_acesses; + double *m_num_dpdiv_acesses; + double *m_num_sqrt_acesses; + double *m_num_log_acesses; + double *m_num_sin_acesses; + double *m_num_exp_acesses; + double *m_num_mem_acesses; unsigned *m_num_sp_committed; unsigned *m_num_tlb_hits; unsigned *m_num_tlb_accesses; @@ -1730,13 +1739,15 @@ struct shader_core_stats_pod { unsigned *m_read_regfile_acesses; unsigned *m_write_regfile_acesses; unsigned *m_non_rf_operands; - unsigned *m_num_imul24_acesses; - unsigned *m_num_imul32_acesses; + double *m_num_imul24_acesses; + double *m_num_imul32_acesses; unsigned *m_active_sp_lanes; unsigned *m_active_sfu_lanes; unsigned *m_active_tensor_core_lanes; unsigned *m_active_fu_lanes; unsigned *m_active_fu_mem_lanes; + double *m_active_exu_threads; //For power model + double *m_active_exu_warps; //For power model unsigned *m_n_diverge; // number of divergence occurring in this shader unsigned gpgpu_n_load_insn; 
unsigned gpgpu_n_store_insn; @@ -1807,38 +1818,56 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_loadqueued_insn = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_num_tex_inst = + (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_INTdecoded_insn = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_ialu_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_fp_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tex_inst = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul24_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul32_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_fpmul_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_idiv_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_fpdiv_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_dp_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_dpmul_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_dpdiv_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); m_num_sp_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), 
sizeof(double)); m_num_sfu_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tensor_core_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_trans_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_tensor_core_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_const_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_tex_acesses = + (double *)calloc(config->num_shader(), sizeof(double)); + m_num_sqrt_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_log_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_sin_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); + m_num_exp_acesses = + (double*) calloc(config->num_shader(),sizeof(double)); m_num_mem_acesses = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + (double *)calloc(config->num_shader(), sizeof(double)); m_num_sp_committed = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tlb_hits = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_num_tlb_hits = + (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_tlb_accesses = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_active_sp_lanes = @@ -1849,6 +1878,10 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_active_fu_lanes = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_active_exu_threads = + (double *)calloc(config->num_shader(), sizeof(double)); + m_active_exu_warps = + (double *)calloc(config->num_shader(), sizeof(double)); m_active_fu_mem_lanes = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_sfu_committed = @@ -1863,7 +1896,8 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), 
sizeof(unsigned)); m_non_rf_operands = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_n_diverge = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_n_diverge = + (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); shader_cycle_distro = (unsigned *)calloc(config->warp_size + 3, sizeof(unsigned)); last_shader_cycle_distro = @@ -1892,6 +1926,48 @@ class shader_core_stats : public shader_core_stats_pod { delete m_incoming_traffic_stats; free(m_num_sim_insn); free(m_num_sim_winsn); + free(m_num_FPdecoded_insn); + free(m_num_INTdecoded_insn); + free(m_num_storequeued_insn); + free(m_num_loadqueued_insn); + free(m_num_ialu_acesses); + free(m_num_fp_acesses); + free(m_num_imul_acesses); + free(m_num_tex_inst); + free(m_num_fpmul_acesses); + free(m_num_idiv_acesses); + free(m_num_fpdiv_acesses); + free(m_num_sp_acesses); + free(m_num_sfu_acesses); + free(m_num_tensor_core_acesses); + free(m_num_tex_acesses); + free(m_num_const_acesses); + free(m_num_dp_acesses); + free(m_num_dpmul_acesses); + free(m_num_dpdiv_acesses); + free(m_num_sqrt_acesses); + free(m_num_log_acesses); + free(m_num_sin_acesses); + free(m_num_exp_acesses); + free(m_num_mem_acesses); + free(m_num_sp_committed); + free(m_num_tlb_hits); + free(m_num_tlb_accesses); + free(m_num_sfu_committed); + free(m_num_tensor_core_committed); + free(m_num_mem_committed); + free(m_read_regfile_acesses); + free(m_write_regfile_acesses); + free(m_non_rf_operands); + free(m_num_imul24_acesses); + free(m_num_imul32_acesses); + free(m_active_sp_lanes); + free(m_active_sfu_lanes); + free(m_active_tensor_core_lanes); + free(m_active_fu_lanes); + free(m_active_exu_threads); + free(m_active_exu_warps); + free(m_active_fu_mem_lanes); free(m_n_diverge); free(shader_cycle_distro); free(last_shader_cycle_distro); @@ -1996,7 +2072,7 @@ class shader_core_ctx : public core_t { printf("GPGPU-Sim uArch: Shader %d bind to kernel %u \'%s\'\n", m_sid, m_kernel->get_uid(), m_kernel->name().c_str()); } - + 
PowerscalingCoefficients *scaling_coeffs; // accessors bool fetch_unit_response_buffer_full() const; bool ldst_unit_response_buffer_full() const; @@ -2054,119 +2130,206 @@ class shader_core_ctx : public core_t { void incload_stat() { m_stats->m_num_loadqueued_insn[m_sid]++; } void incstore_stat() { m_stats->m_num_storequeued_insn[m_sid]++; } - void incialu_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_ialu_acesses[m_sid] = - m_stats->m_num_ialu_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_ialu_acesses[m_sid] = - m_stats->m_num_ialu_acesses[m_sid] + active_count * latency; + void incialu_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_ialu_acesses[m_sid]=m_stats->m_num_ialu_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_ialu_acesses[m_sid]=m_stats->m_num_ialu_acesses[m_sid]+(double)active_count*latency; } - } - void inctex_stat(unsigned active_count, double latency) { - m_stats->m_num_tex_inst[m_sid] = - m_stats->m_num_tex_inst[m_sid] + active_count * latency; - } - void incimul_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_imul_acesses[m_sid] = - m_stats->m_num_imul_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_imul_acesses[m_sid] = - m_stats->m_num_imul_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incimul_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_imul_acesses[m_sid]=m_stats->m_num_imul_acesses[m_sid]+(double)active_count*latency + + 
inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_imul_acesses[m_sid]=m_stats->m_num_imul_acesses[m_sid]+(double)active_count*latency; } - } - void incimul24_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_imul24_acesses[m_sid] = - m_stats->m_num_imul24_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_imul24_acesses[m_sid] = - m_stats->m_num_imul24_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incimul24_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_imul24_acesses[m_sid]=m_stats->m_num_imul24_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_imul24_acesses[m_sid]=m_stats->m_num_imul24_acesses[m_sid]+(double)active_count*latency; } - } - void incimul32_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_imul32_acesses[m_sid] = - m_stats->m_num_imul32_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_imul32_acesses[m_sid] = - m_stats->m_num_imul32_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incimul32_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_imul32_acesses[m_sid]=m_stats->m_num_imul32_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_imul32_acesses[m_sid]=m_stats->m_num_imul32_acesses[m_sid]+(double)active_count*latency; } - // printf("Int_Mul -- Active_count: 
%d\n",active_count); - } - void incidiv_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_idiv_acesses[m_sid] = - m_stats->m_num_idiv_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_idiv_acesses[m_sid] = - m_stats->m_num_idiv_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incidiv_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_idiv_acesses[m_sid]=m_stats->m_num_idiv_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else { + m_stats->m_num_idiv_acesses[m_sid]=m_stats->m_num_idiv_acesses[m_sid]+(double)active_count*latency; } - } - void incfpalu_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_fp_acesses[m_sid] = - m_stats->m_num_fp_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_fp_acesses[m_sid] = - m_stats->m_num_fp_acesses[m_sid] + active_count * latency; + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incfpalu_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_fp_acesses[m_sid]=m_stats->m_num_fp_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_fp_acesses[m_sid]=m_stats->m_num_fp_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incfpmul_stat(unsigned active_count,double latency) { + // printf("FP MUL stat increament\n"); + if(m_config->gpgpu_clock_gated_lanes==false){ + 
m_stats->m_num_fpmul_acesses[m_sid]=m_stats->m_num_fpmul_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_fpmul_acesses[m_sid]=m_stats->m_num_fpmul_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incfpdiv_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_fpdiv_acesses[m_sid]=m_stats->m_num_fpdiv_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else { + m_stats->m_num_fpdiv_acesses[m_sid]=m_stats->m_num_fpdiv_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpalu_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_dp_acesses[m_sid]=m_stats->m_num_dp_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_dp_acesses[m_sid]=m_stats->m_num_dp_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpmul_stat(unsigned active_count,double latency) { + // printf("FP MUL stat increament\n"); + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_dpmul_acesses[m_sid]=m_stats->m_num_dpmul_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + }else { + m_stats->m_num_dpmul_acesses[m_sid]=m_stats->m_num_dpmul_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpdiv_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + 
m_stats->m_num_dpdiv_acesses[m_sid]=m_stats->m_num_dpdiv_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else { + m_stats->m_num_dpdiv_acesses[m_sid]=m_stats->m_num_dpdiv_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void incsqrt_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_sqrt_acesses[m_sid]=m_stats->m_num_sqrt_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_sqrt_acesses[m_sid]=m_stats->m_num_sqrt_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void inclog_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_log_acesses[m_sid]=m_stats->m_num_log_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_log_acesses[m_sid]=m_stats->m_num_log_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void incexp_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_exp_acesses[m_sid]=m_stats->m_num_exp_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_exp_acesses[m_sid]=m_stats->m_num_exp_acesses[m_sid]+(double)active_count*latency; + } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; } - void incfpmul_stat(unsigned active_count, double latency) { - // printf("FP MUL stat increament\n"); - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_fpmul_acesses[m_sid] = - 
m_stats->m_num_fpmul_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_nonsfu(active_count, latency); - } else { - m_stats->m_num_fpmul_acesses[m_sid] = - m_stats->m_num_fpmul_acesses[m_sid] + active_count * latency; + + void incsin_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_sin_acesses[m_sid]=m_stats->m_num_sin_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_sin_acesses[m_sid]=m_stats->m_num_sin_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; } - void incfpdiv_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_fpdiv_acesses[m_sid] = - m_stats->m_num_fpdiv_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_fpdiv_acesses[m_sid] = - m_stats->m_num_fpdiv_acesses[m_sid] + active_count * latency; + + + void inctensor_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_tensor_core_acesses[m_sid]=m_stats->m_num_tensor_core_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_tensor_core_acesses[m_sid]=m_stats->m_num_tensor_core_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; } - void inctrans_stat(unsigned active_count, double latency) { - if (m_config->gpgpu_clock_gated_lanes == false) { - m_stats->m_num_trans_acesses[m_sid] = - m_stats->m_num_trans_acesses[m_sid] + active_count * latency + - inactive_lanes_accesses_sfu(active_count, latency); - } else { - m_stats->m_num_trans_acesses[m_sid] = - m_stats->m_num_trans_acesses[m_sid] + active_count * latency; + + void 
inctex_stat(unsigned active_count,double latency) { + if(m_config->gpgpu_clock_gated_lanes==false){ + m_stats->m_num_tex_acesses[m_sid]=m_stats->m_num_tex_acesses[m_sid]+(double)active_count*latency + + inactive_lanes_accesses_sfu(active_count, latency); + }else{ + m_stats->m_num_tex_acesses[m_sid]=m_stats->m_num_tex_acesses[m_sid]+(double)active_count*latency; } + m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + + void inc_const_accesses(unsigned active_count) { + m_stats->m_num_const_acesses[m_sid]=m_stats->m_num_const_acesses[m_sid]+active_count; } void incsfu_stat(unsigned active_count, double latency) { m_stats->m_num_sfu_acesses[m_sid] = - m_stats->m_num_sfu_acesses[m_sid] + active_count * latency; + m_stats->m_num_sfu_acesses[m_sid] + (double)active_count*latency; } void incsp_stat(unsigned active_count, double latency) { m_stats->m_num_sp_acesses[m_sid] = - m_stats->m_num_sp_acesses[m_sid] + active_count * latency; + m_stats->m_num_sp_acesses[m_sid] + (double)active_count*latency; } void incmem_stat(unsigned active_count, double latency) { if (m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_mem_acesses[m_sid] = - m_stats->m_num_mem_acesses[m_sid] + active_count * latency + + m_stats->m_num_mem_acesses[m_sid] + (double)active_count*latency + inactive_lanes_accesses_nonsfu(active_count, latency); } else { m_stats->m_num_mem_acesses[m_sid] = - m_stats->m_num_mem_acesses[m_sid] + active_count * latency; + m_stats->m_num_mem_acesses[m_sid] + (double)active_count*latency; } } void incexecstat(warp_inst_t *&inst); diff --git a/src/gpgpu-sim/stat-tool.cc b/src/gpgpu-sim/stat-tool.cc index 6fafaa6af..0513d17ed 100644 --- a/src/gpgpu-sim/stat-tool.cc +++ b/src/gpgpu-sim/stat-tool.cc @@ -369,8 +369,6 @@ void shader_mem_lat_print(FILE *fout) { static int s_cache_access_logger_n_types = 0; static std::vector s_cache_access_logger; -enum cache_access_logger_types { NORMALS, TEXTURE, CONSTANT, INSTRUCTION }; - 
int get_shader_normal_cache_id() { return NORMALS; } int get_shader_texture_cache_id() { return TEXTURE; } int get_shader_constant_cache_id() { return CONSTANT; } diff --git a/src/gpgpu-sim/stat-tool.h b/src/gpgpu-sim/stat-tool.h index 3a291be3a..fdf875600 100644 --- a/src/gpgpu-sim/stat-tool.h +++ b/src/gpgpu-sim/stat-tool.h @@ -268,6 +268,8 @@ class linear_histogram_logger : public snap_shot_trigger, static int s_ids; }; +enum cache_access_logger_types { NORMALS, TEXTURE, CONSTANT, INSTRUCTION }; + void try_snap_shot(unsigned long long current_cycle); void set_spill_interval(unsigned long long interval); void spill_log_to_file(FILE *fout, int final, unsigned long long current_cycle); diff --git a/src/gpuwattch/gpgpu_sim_wrapper.cc b/src/gpuwattch/gpgpu_sim_wrapper.cc deleted file mode 100644 index f2989f630..000000000 --- a/src/gpuwattch/gpgpu_sim_wrapper.cc +++ /dev/null @@ -1,863 +0,0 @@ -// Copyright (c) 2009-2011, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, -// The University of British Columbia -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. Neither the name of -// The University of British Columbia nor the names of its contributors may be -// used to endorse or promote products derived from this software without -// specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. - -#include "gpgpu_sim_wrapper.h" -#include -#define SP_BASE_POWER 0 -#define SFU_BASE_POWER 0 - -static const char* pwr_cmp_label[] = { - "IBP,", "ICP,", "DCP,", "TCP,", "CCP,", "SHRDP,", - "RFP,", "SPP,", "SFUP,", "FPUP,", "SCHEDP,", "L2CP,", - "MCP,", "NOCP,", "DRAMP,", "PIPEP,", "IDLE_COREP,", "CONST_DYNAMICP"}; - -enum pwr_cmp_t { - IBP = 0, - ICP, - DCP, - TCP, - CCP, - SHRDP, - RFP, - SPP, - SFUP, - FPUP, - SCHEDP, - L2CP, - MCP, - NOCP, - DRAMP, - PIPEP, - IDLE_COREP, - CONST_DYNAMICP, - NUM_COMPONENTS_MODELLED -}; - -gpgpu_sim_wrapper::gpgpu_sim_wrapper(bool power_simulation_enabled, - char* xmlfile) { - kernel_sample_count = 0; - total_sample_count = 0; - - kernel_tot_power = 0; - - num_pwr_cmps = NUM_COMPONENTS_MODELLED; - num_perf_counters = NUM_PERFORMANCE_COUNTERS; - - // Initialize per-component counter/power vectors - avg_max_min_counters init; - kernel_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, init); - kernel_cmp_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, init); - - kernel_power = init; // Per-kernel powers - gpu_tot_power = init; // Global powers - - sample_cmp_pwr.resize(NUM_COMPONENTS_MODELLED, 0); - - sample_perf_counters.resize(NUM_PERFORMANCE_COUNTERS, 
0); - initpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); - effpower_coeff.resize(NUM_PERFORMANCE_COUNTERS, 0); - - const_dynamic_power = 0; - proc_power = 0; - - g_power_filename = NULL; - g_power_trace_filename = NULL; - g_metric_trace_filename = NULL; - g_steady_state_tracking_filename = NULL; - xml_filename = xmlfile; - g_power_simulation_enabled = power_simulation_enabled; - g_power_trace_enabled = false; - g_steady_power_levels_enabled = false; - g_power_trace_zlevel = 0; - g_power_per_cycle_dump = false; - gpu_steady_power_deviation = 0; - gpu_steady_min_period = 0; - - gpu_stat_sample_freq = 0; - p = new ParseXML(); - if (g_power_simulation_enabled) { - p->parse(xml_filename); - } - proc = new Processor(p); - power_trace_file = NULL; - metric_trace_file = NULL; - steady_state_tacking_file = NULL; - has_written_avg = false; - init_inst_val = false; -} - -gpgpu_sim_wrapper::~gpgpu_sim_wrapper() {} - -bool gpgpu_sim_wrapper::sanity_check(double a, double b) { - if (b == 0) - return (abs(a - b) < 0.00001); - else - return (abs(a - b) / abs(b) < 0.00001); - - return false; -} -void gpgpu_sim_wrapper::init_mcpat( - char* xmlfile, char* powerfilename, char* power_trace_filename, - char* metric_trace_filename, char* steady_state_filename, - bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, - bool power_per_cycle_dump, double steady_power_deviation, - double steady_min_period, int zlevel, double init_val, - int stat_sample_freq) { - // Write File Headers for (-metrics trace, -power trace) - - reset_counters(); - static bool mcpat_init = true; - - // initialize file name if it is not set - time_t curr_time; - time(&curr_time); - char* date = ctime(&curr_time); - char* s = date; - while (*s) { - if (*s == ' ' || *s == '\t' || *s == ':') *s = '-'; - if (*s == '\n' || *s == '\r') *s = 0; - s++; - } - - if (mcpat_init) { - g_power_filename = powerfilename; - g_power_trace_filename = power_trace_filename; - g_metric_trace_filename = 
metric_trace_filename; - g_steady_state_tracking_filename = steady_state_filename; - xml_filename = xmlfile; - g_power_simulation_enabled = power_sim_enabled; - g_power_trace_enabled = trace_enabled; - g_steady_power_levels_enabled = steady_state_enabled; - g_power_trace_zlevel = zlevel; - g_power_per_cycle_dump = power_per_cycle_dump; - gpu_steady_power_deviation = steady_power_deviation; - gpu_steady_min_period = steady_min_period; - - gpu_stat_sample_freq = stat_sample_freq; - - // p->sys.total_cycles=gpu_stat_sample_freq*4; - p->sys.total_cycles = gpu_stat_sample_freq; - power_trace_file = NULL; - metric_trace_file = NULL; - steady_state_tacking_file = NULL; - - if (g_power_trace_enabled) { - power_trace_file = gzopen(g_power_trace_filename, "w"); - metric_trace_file = gzopen(g_metric_trace_filename, "w"); - if ((power_trace_file == NULL) || (metric_trace_file == NULL)) { - printf("error - could not open trace files \n"); - exit(1); - } - gzsetparams(power_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); - - gzprintf(power_trace_file, "power,"); - for (unsigned i = 0; i < num_pwr_cmps; i++) { - gzprintf(power_trace_file, pwr_cmp_label[i]); - } - gzprintf(power_trace_file, "\n"); - - gzsetparams(metric_trace_file, g_power_trace_zlevel, Z_DEFAULT_STRATEGY); - for (unsigned i = 0; i < num_perf_counters; i++) { - gzprintf(metric_trace_file, perf_count_label[i]); - } - gzprintf(metric_trace_file, "\n"); - - gzclose(power_trace_file); - gzclose(metric_trace_file); - } - if (g_steady_power_levels_enabled) { - steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "w"); - if ((steady_state_tacking_file == NULL)) { - printf("error - could not open trace files \n"); - exit(1); - } - gzsetparams(steady_state_tacking_file, g_power_trace_zlevel, - Z_DEFAULT_STRATEGY); - gzprintf(steady_state_tacking_file, "start,end,power,IPC,"); - for (unsigned i = 0; i < num_perf_counters; i++) { - gzprintf(steady_state_tacking_file, perf_count_label[i]); - } - 
gzprintf(steady_state_tacking_file, "\n"); - - gzclose(steady_state_tacking_file); - } - - mcpat_init = false; - has_written_avg = false; - powerfile.open(g_power_filename); - int flg = chmod(g_power_filename, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); - assert(flg == 0); - } - sample_val = 0; - init_inst_val = init_val; // gpu_tot_sim_insn+gpu_sim_insn; -} - -void gpgpu_sim_wrapper::reset_counters() { - avg_max_min_counters init; - for (unsigned i = 0; i < num_perf_counters; ++i) { - sample_perf_counters[i] = 0; - kernel_cmp_perf_counters[i] = init; - } - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - sample_cmp_pwr[i] = 0; - kernel_cmp_pwr[i] = init; - } - - // Reset per-kernel counters - kernel_sample_count = 0; - kernel_tot_power = 0; - kernel_power = init; - - return; -} - -void gpgpu_sim_wrapper::set_inst_power(bool clk_gated_lanes, double tot_cycles, - double busy_cycles, double tot_inst, - double int_inst, double fp_inst, - double load_inst, double store_inst, - double committed_inst) { - p->sys.core[0].gpgpu_clock_gated_lanes = clk_gated_lanes; - p->sys.core[0].total_cycles = tot_cycles; - p->sys.core[0].busy_cycles = busy_cycles; - p->sys.core[0].total_instructions = - tot_inst * p->sys.scaling_coefficients[TOT_INST]; - p->sys.core[0].int_instructions = - int_inst * p->sys.scaling_coefficients[FP_INT]; - p->sys.core[0].fp_instructions = - fp_inst * p->sys.scaling_coefficients[FP_INT]; - p->sys.core[0].load_instructions = load_inst; - p->sys.core[0].store_instructions = store_inst; - p->sys.core[0].committed_instructions = committed_inst; - sample_perf_counters[FP_INT] = int_inst + fp_inst; - sample_perf_counters[TOT_INST] = tot_inst; -} - -void gpgpu_sim_wrapper::set_regfile_power(double reads, double writes, - double ops) { - p->sys.core[0].int_regfile_reads = - reads * p->sys.scaling_coefficients[REG_RD]; - p->sys.core[0].int_regfile_writes = - writes * p->sys.scaling_coefficients[REG_WR]; - p->sys.core[0].non_rf_operands = - ops * 
p->sys.scaling_coefficients[NON_REG_OPs]; - sample_perf_counters[REG_RD] = reads; - sample_perf_counters[REG_WR] = writes; - sample_perf_counters[NON_REG_OPs] = ops; -} - -void gpgpu_sim_wrapper::set_icache_power(double hits, double misses) { - p->sys.core[0].icache.read_accesses = - hits * p->sys.scaling_coefficients[IC_H] + - misses * p->sys.scaling_coefficients[IC_M]; - p->sys.core[0].icache.read_misses = - misses * p->sys.scaling_coefficients[IC_M]; - sample_perf_counters[IC_H] = hits; - sample_perf_counters[IC_M] = misses; -} - -void gpgpu_sim_wrapper::set_ccache_power(double hits, double misses) { - p->sys.core[0].ccache.read_accesses = - hits * p->sys.scaling_coefficients[CC_H] + - misses * p->sys.scaling_coefficients[CC_M]; - p->sys.core[0].ccache.read_misses = - misses * p->sys.scaling_coefficients[CC_M]; - sample_perf_counters[CC_H] = hits; - sample_perf_counters[CC_M] = misses; - // TODO: coalescing logic is counted as part of the caches power (this is not - // valid for no-caches architectures) -} - -void gpgpu_sim_wrapper::set_tcache_power(double hits, double misses) { - p->sys.core[0].tcache.read_accesses = - hits * p->sys.scaling_coefficients[TC_H] + - misses * p->sys.scaling_coefficients[TC_M]; - p->sys.core[0].tcache.read_misses = - misses * p->sys.scaling_coefficients[TC_M]; - sample_perf_counters[TC_H] = hits; - sample_perf_counters[TC_M] = misses; - // TODO: coalescing logic is counted as part of the caches power (this is not - // valid for no-caches architectures) -} - -void gpgpu_sim_wrapper::set_shrd_mem_power(double accesses) { - p->sys.core[0].sharedmemory.read_accesses = - accesses * p->sys.scaling_coefficients[SHRD_ACC]; - sample_perf_counters[SHRD_ACC] = accesses; -} - -void gpgpu_sim_wrapper::set_l1cache_power(double read_hits, double read_misses, - double write_hits, - double write_misses) { - p->sys.core[0].dcache.read_accesses = - read_hits * p->sys.scaling_coefficients[DC_RH] + - read_misses * p->sys.scaling_coefficients[DC_RM]; - 
p->sys.core[0].dcache.read_misses = - read_misses * p->sys.scaling_coefficients[DC_RM]; - p->sys.core[0].dcache.write_accesses = - write_hits * p->sys.scaling_coefficients[DC_WH] + - write_misses * p->sys.scaling_coefficients[DC_WM]; - p->sys.core[0].dcache.write_misses = - write_misses * p->sys.scaling_coefficients[DC_WM]; - sample_perf_counters[DC_RH] = read_hits; - sample_perf_counters[DC_RM] = read_misses; - sample_perf_counters[DC_WH] = write_hits; - sample_perf_counters[DC_WM] = write_misses; - // TODO: coalescing logic is counted as part of the caches power (this is not - // valid for no-caches architectures) -} - -void gpgpu_sim_wrapper::set_l2cache_power(double read_hits, double read_misses, - double write_hits, - double write_misses) { - p->sys.l2.total_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + - read_misses * p->sys.scaling_coefficients[L2_RM] + - write_hits * p->sys.scaling_coefficients[L2_WH] + - write_misses * p->sys.scaling_coefficients[L2_WM]; - p->sys.l2.read_accesses = read_hits * p->sys.scaling_coefficients[L2_RH] + - read_misses * p->sys.scaling_coefficients[L2_RM]; - p->sys.l2.write_accesses = write_hits * p->sys.scaling_coefficients[L2_WH] + - write_misses * p->sys.scaling_coefficients[L2_WM]; - p->sys.l2.read_hits = read_hits * p->sys.scaling_coefficients[L2_RH]; - p->sys.l2.read_misses = read_misses * p->sys.scaling_coefficients[L2_RM]; - p->sys.l2.write_hits = write_hits * p->sys.scaling_coefficients[L2_WH]; - p->sys.l2.write_misses = write_misses * p->sys.scaling_coefficients[L2_WM]; - sample_perf_counters[L2_RH] = read_hits; - sample_perf_counters[L2_RM] = read_misses; - sample_perf_counters[L2_WH] = write_hits; - sample_perf_counters[L2_WM] = write_misses; -} - -void gpgpu_sim_wrapper::set_idle_core_power(double num_idle_core) { - p->sys.num_idle_cores = num_idle_core; - sample_perf_counters[IDLE_CORE_N] = num_idle_core; -} - -void gpgpu_sim_wrapper::set_duty_cycle_power(double duty_cycle) { - 
p->sys.core[0].pipeline_duty_cycle = - duty_cycle * p->sys.scaling_coefficients[PIPE_A]; - sample_perf_counters[PIPE_A] = duty_cycle; -} - -void gpgpu_sim_wrapper::set_mem_ctrl_power(double reads, double writes, - double dram_precharge) { - p->sys.mc.memory_accesses = reads * p->sys.scaling_coefficients[MEM_RD] + - writes * p->sys.scaling_coefficients[MEM_WR]; - p->sys.mc.memory_reads = reads * p->sys.scaling_coefficients[MEM_RD]; - p->sys.mc.memory_writes = writes * p->sys.scaling_coefficients[MEM_WR]; - p->sys.mc.dram_pre = dram_precharge * p->sys.scaling_coefficients[MEM_PRE]; - sample_perf_counters[MEM_RD] = reads; - sample_perf_counters[MEM_WR] = writes; - sample_perf_counters[MEM_PRE] = dram_precharge; -} - -void gpgpu_sim_wrapper::set_exec_unit_power(double fpu_accesses, - double ialu_accesses, - double sfu_accesses) { - p->sys.core[0].fpu_accesses = - fpu_accesses * p->sys.scaling_coefficients[FPU_ACC]; - // Integer ALU (not present in Tesla) - p->sys.core[0].ialu_accesses = - ialu_accesses * p->sys.scaling_coefficients[SP_ACC]; - // Sfu accesses - p->sys.core[0].mul_accesses = - sfu_accesses * p->sys.scaling_coefficients[SFU_ACC]; - - sample_perf_counters[SP_ACC] = ialu_accesses; - sample_perf_counters[SFU_ACC] = sfu_accesses; - sample_perf_counters[FPU_ACC] = fpu_accesses; -} - -void gpgpu_sim_wrapper::set_active_lanes_power(double sp_avg_active_lane, - double sfu_avg_active_lane) { - p->sys.core[0].sp_average_active_lanes = sp_avg_active_lane; - p->sys.core[0].sfu_average_active_lanes = sfu_avg_active_lane; -} - -void gpgpu_sim_wrapper::set_NoC_power(double noc_tot_reads, - double noc_tot_writes) { - p->sys.NoC[0].total_accesses = - noc_tot_reads * p->sys.scaling_coefficients[NOC_A] + - noc_tot_writes * p->sys.scaling_coefficients[NOC_A]; - sample_perf_counters[NOC_A] = noc_tot_reads + noc_tot_writes; -} - -void gpgpu_sim_wrapper::power_metrics_calculations() { - total_sample_count++; - kernel_sample_count++; - - // Current sample power - double 
sample_power = - proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONST_DYNAMICP]; - - // Average power - // Previous + new + constant dynamic power (e.g., dynamic clocking power) - kernel_tot_power += sample_power; - kernel_power.avg = kernel_tot_power / kernel_sample_count; - for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { - kernel_cmp_pwr[ind].avg += (double)sample_cmp_pwr[ind]; - } - - for (unsigned ind = 0; ind < num_perf_counters; ++ind) { - kernel_cmp_perf_counters[ind].avg += (double)sample_perf_counters[ind]; - } - - // Max Power - if (sample_power > kernel_power.max) { - kernel_power.max = sample_power; - for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { - kernel_cmp_pwr[ind].max = (double)sample_cmp_pwr[ind]; - } - for (unsigned ind = 0; ind < num_perf_counters; ++ind) { - kernel_cmp_perf_counters[ind].max = sample_perf_counters[ind]; - } - } - - // Min Power - if (sample_power < kernel_power.min || (kernel_power.min == 0)) { - kernel_power.min = sample_power; - for (unsigned ind = 0; ind < num_pwr_cmps; ++ind) { - kernel_cmp_pwr[ind].min = (double)sample_cmp_pwr[ind]; - } - for (unsigned ind = 0; ind < num_perf_counters; ++ind) { - kernel_cmp_perf_counters[ind].min = sample_perf_counters[ind]; - } - } - - gpu_tot_power.avg = (gpu_tot_power.avg + sample_power); - gpu_tot_power.max = - (sample_power > gpu_tot_power.max) ? sample_power : gpu_tot_power.max; - gpu_tot_power.min = - ((sample_power < gpu_tot_power.min) || (gpu_tot_power.min == 0)) - ? 
sample_power - : gpu_tot_power.min; -} - -void gpgpu_sim_wrapper::print_trace_files() { - open_files(); - - for (unsigned i = 0; i < num_perf_counters; ++i) { - gzprintf(metric_trace_file, "%f,", sample_perf_counters[i]); - } - gzprintf(metric_trace_file, "\n"); - - gzprintf(power_trace_file, "%f,", proc_power); - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - gzprintf(power_trace_file, "%f,", sample_cmp_pwr[i]); - } - gzprintf(power_trace_file, "\n"); - - close_files(); -} - -void gpgpu_sim_wrapper::update_coefficients() { - initpower_coeff[FP_INT] = proc->cores[0]->get_coefficient_fpint_insts(); - effpower_coeff[FP_INT] = - initpower_coeff[FP_INT] * p->sys.scaling_coefficients[FP_INT]; - - initpower_coeff[TOT_INST] = proc->cores[0]->get_coefficient_tot_insts(); - effpower_coeff[TOT_INST] = - initpower_coeff[TOT_INST] * p->sys.scaling_coefficients[TOT_INST]; - - initpower_coeff[REG_RD] = - proc->cores[0]->get_coefficient_regreads_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - initpower_coeff[REG_WR] = - proc->cores[0]->get_coefficient_regwrites_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - initpower_coeff[NON_REG_OPs] = - proc->cores[0]->get_coefficient_noregfileops_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - effpower_coeff[REG_RD] = - initpower_coeff[REG_RD] * p->sys.scaling_coefficients[REG_RD]; - effpower_coeff[REG_WR] = - initpower_coeff[REG_WR] * p->sys.scaling_coefficients[REG_WR]; - effpower_coeff[NON_REG_OPs] = - initpower_coeff[NON_REG_OPs] * p->sys.scaling_coefficients[NON_REG_OPs]; - - initpower_coeff[IC_H] = proc->cores[0]->get_coefficient_icache_hits(); - initpower_coeff[IC_M] = proc->cores[0]->get_coefficient_icache_misses(); - effpower_coeff[IC_H] = - initpower_coeff[IC_H] * p->sys.scaling_coefficients[IC_H]; - effpower_coeff[IC_M] = - initpower_coeff[IC_M] * p->sys.scaling_coefficients[IC_M]; - - 
initpower_coeff[CC_H] = (proc->cores[0]->get_coefficient_ccache_readhits() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[CC_M] = (proc->cores[0]->get_coefficient_ccache_readmisses() + - proc->get_coefficient_readcoalescing()); - effpower_coeff[CC_H] = - initpower_coeff[CC_H] * p->sys.scaling_coefficients[CC_H]; - effpower_coeff[CC_M] = - initpower_coeff[CC_M] * p->sys.scaling_coefficients[CC_M]; - - initpower_coeff[TC_H] = (proc->cores[0]->get_coefficient_tcache_readhits() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[TC_M] = (proc->cores[0]->get_coefficient_tcache_readmisses() + - proc->get_coefficient_readcoalescing()); - effpower_coeff[TC_H] = - initpower_coeff[TC_H] * p->sys.scaling_coefficients[TC_H]; - effpower_coeff[TC_M] = - initpower_coeff[TC_M] * p->sys.scaling_coefficients[TC_M]; - - initpower_coeff[SHRD_ACC] = - proc->cores[0]->get_coefficient_sharedmemory_readhits(); - effpower_coeff[SHRD_ACC] = - initpower_coeff[SHRD_ACC] * p->sys.scaling_coefficients[SHRD_ACC]; - - initpower_coeff[DC_RH] = (proc->cores[0]->get_coefficient_dcache_readhits() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[DC_RM] = - (proc->cores[0]->get_coefficient_dcache_readmisses() + - proc->get_coefficient_readcoalescing()); - initpower_coeff[DC_WH] = (proc->cores[0]->get_coefficient_dcache_writehits() + - proc->get_coefficient_writecoalescing()); - initpower_coeff[DC_WM] = - (proc->cores[0]->get_coefficient_dcache_writemisses() + - proc->get_coefficient_writecoalescing()); - effpower_coeff[DC_RH] = - initpower_coeff[DC_RH] * p->sys.scaling_coefficients[DC_RH]; - effpower_coeff[DC_RM] = - initpower_coeff[DC_RM] * p->sys.scaling_coefficients[DC_RM]; - effpower_coeff[DC_WH] = - initpower_coeff[DC_WH] * p->sys.scaling_coefficients[DC_WH]; - effpower_coeff[DC_WM] = - initpower_coeff[DC_WM] * p->sys.scaling_coefficients[DC_WM]; - - initpower_coeff[L2_RH] = proc->get_coefficient_l2_read_hits(); - initpower_coeff[L2_RM] = 
proc->get_coefficient_l2_read_misses(); - initpower_coeff[L2_WH] = proc->get_coefficient_l2_write_hits(); - initpower_coeff[L2_WM] = proc->get_coefficient_l2_write_misses(); - effpower_coeff[L2_RH] = - initpower_coeff[L2_RH] * p->sys.scaling_coefficients[L2_RH]; - effpower_coeff[L2_RM] = - initpower_coeff[L2_RM] * p->sys.scaling_coefficients[L2_RM]; - effpower_coeff[L2_WH] = - initpower_coeff[L2_WH] * p->sys.scaling_coefficients[L2_WH]; - effpower_coeff[L2_WM] = - initpower_coeff[L2_WM] * p->sys.scaling_coefficients[L2_WM]; - - initpower_coeff[IDLE_CORE_N] = - p->sys.idle_core_power * proc->cores[0]->executionTime; - effpower_coeff[IDLE_CORE_N] = - initpower_coeff[IDLE_CORE_N] * p->sys.scaling_coefficients[IDLE_CORE_N]; - - initpower_coeff[PIPE_A] = proc->cores[0]->get_coefficient_duty_cycle(); - effpower_coeff[PIPE_A] = - initpower_coeff[PIPE_A] * p->sys.scaling_coefficients[PIPE_A]; - - initpower_coeff[MEM_RD] = proc->get_coefficient_mem_reads(); - initpower_coeff[MEM_WR] = proc->get_coefficient_mem_writes(); - initpower_coeff[MEM_PRE] = proc->get_coefficient_mem_pre(); - effpower_coeff[MEM_RD] = - initpower_coeff[MEM_RD] * p->sys.scaling_coefficients[MEM_RD]; - effpower_coeff[MEM_WR] = - initpower_coeff[MEM_WR] * p->sys.scaling_coefficients[MEM_WR]; - effpower_coeff[MEM_PRE] = - initpower_coeff[MEM_PRE] * p->sys.scaling_coefficients[MEM_PRE]; - - initpower_coeff[SP_ACC] = - proc->cores[0]->get_coefficient_ialu_accesses() * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - ; - initpower_coeff[SFU_ACC] = proc->cores[0]->get_coefficient_sfu_accesses(); - initpower_coeff[FPU_ACC] = proc->cores[0]->get_coefficient_fpu_accesses(); - - effpower_coeff[SP_ACC] = - initpower_coeff[SP_ACC] * p->sys.scaling_coefficients[SP_ACC]; - effpower_coeff[SFU_ACC] = - initpower_coeff[SFU_ACC] * p->sys.scaling_coefficients[SFU_ACC]; - effpower_coeff[FPU_ACC] = - initpower_coeff[FPU_ACC] * p->sys.scaling_coefficients[FPU_ACC]; - - initpower_coeff[NOC_A] = 
proc->get_coefficient_noc_accesses(); - effpower_coeff[NOC_A] = - initpower_coeff[NOC_A] * p->sys.scaling_coefficients[NOC_A]; - - const_dynamic_power = - proc->get_const_dynamic_power() / (proc->cores[0]->executionTime); - - for (unsigned i = 0; i < num_perf_counters; i++) { - initpower_coeff[i] /= (proc->cores[0]->executionTime); - effpower_coeff[i] /= (proc->cores[0]->executionTime); - } -} - -void gpgpu_sim_wrapper::update_components_power() { - update_coefficients(); - - proc_power = proc->rt_power.readOp.dynamic; - - sample_cmp_pwr[IBP] = - (proc->cores[0]->ifu->IB->rt_power.readOp.dynamic + - proc->cores[0]->ifu->IB->rt_power.writeOp.dynamic + - proc->cores[0]->ifu->ID_misc->rt_power.readOp.dynamic + - proc->cores[0]->ifu->ID_operand->rt_power.readOp.dynamic + - proc->cores[0]->ifu->ID_inst->rt_power.readOp.dynamic) / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[ICP] = proc->cores[0]->ifu->icache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[DCP] = proc->cores[0]->lsu->dcache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[TCP] = proc->cores[0]->lsu->tcache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[CCP] = proc->cores[0]->lsu->ccache.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[SHRDP] = - proc->cores[0]->lsu->sharedmemory.rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[RFP] = - (proc->cores[0]->exu->rfu->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)) * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - - sample_cmp_pwr[SPP] = - (proc->cores[0]->exu->exeu->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)) * - (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - - sample_cmp_pwr[SFUP] = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)); - - sample_cmp_pwr[FPUP] = 
(proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime)); - - sample_cmp_pwr[SCHEDP] = proc->cores[0]->exu->scheu->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[L2CP] = (proc->XML->sys.number_of_L2s > 0) - ? proc->l2array[0]->rt_power.readOp.dynamic / - (proc->cores[0]->executionTime) - : 0; - - sample_cmp_pwr[MCP] = (proc->mc->rt_power.readOp.dynamic - - proc->mc->dram->rt_power.readOp.dynamic) / - (proc->cores[0]->executionTime); - - sample_cmp_pwr[NOCP] = - proc->nocs[0]->rt_power.readOp.dynamic / (proc->cores[0]->executionTime); - - sample_cmp_pwr[DRAMP] = - proc->mc->dram->rt_power.readOp.dynamic / (proc->cores[0]->executionTime); - - sample_cmp_pwr[PIPEP] = - proc->cores[0]->Pipeline_energy / (proc->cores[0]->executionTime); - - sample_cmp_pwr[IDLE_COREP] = - proc->cores[0]->IdleCoreEnergy / (proc->cores[0]->executionTime); - - // This constant dynamic power (e.g., clock power) part is estimated via - // regression model. - sample_cmp_pwr[CONST_DYNAMICP] = 0; - double cnst_dyn = - proc->get_const_dynamic_power() / (proc->cores[0]->executionTime); - // If the regression scaling term is greater than the recorded constant - // dynamic power then use the difference (other portion already added to - // dynamic power). Else, all the constant dynamic power is accounted for, add - // nothing. 
- if (p->sys.scaling_coefficients[CONST_DYNAMICN] > cnst_dyn) - sample_cmp_pwr[CONST_DYNAMICP] = - (p->sys.scaling_coefficients[CONST_DYNAMICN] - cnst_dyn); - - proc_power += sample_cmp_pwr[CONST_DYNAMICP]; - - double sum_pwr_cmp = 0; - for (unsigned i = 0; i < num_pwr_cmps; i++) { - sum_pwr_cmp += sample_cmp_pwr[i]; - } - bool check = false; - check = sanity_check(sum_pwr_cmp, proc_power); - assert("Total Power does not equal the sum of the components\n" && (check)); -} - -void gpgpu_sim_wrapper::compute() { proc->compute(); } -void gpgpu_sim_wrapper::print_power_kernel_stats( - double gpu_sim_cycle, double gpu_tot_sim_cycle, double init_value, - const std::string& kernel_info_string, bool print_trace) { - detect_print_steady_state(1, init_value); - if (g_power_simulation_enabled) { - powerfile << kernel_info_string << std::endl; - - sanity_check((kernel_power.avg * kernel_sample_count), kernel_tot_power); - powerfile << "Kernel Average Power Data:" << std::endl; - powerfile << "kernel_avg_power = " << kernel_power.avg << std::endl; - - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - powerfile << "gpu_avg_" << pwr_cmp_label[i] << " = " - << kernel_cmp_pwr[i].avg / kernel_sample_count << std::endl; - } - for (unsigned i = 0; i < num_perf_counters; ++i) { - powerfile << "gpu_avg_" << perf_count_label[i] << " = " - << kernel_cmp_perf_counters[i].avg / kernel_sample_count - << std::endl; - } - - powerfile << std::endl << "Kernel Maximum Power Data:" << std::endl; - powerfile << "kernel_max_power = " << kernel_power.max << std::endl; - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - powerfile << "gpu_max_" << pwr_cmp_label[i] << " = " - << kernel_cmp_pwr[i].max << std::endl; - } - for (unsigned i = 0; i < num_perf_counters; ++i) { - powerfile << "gpu_max_" << perf_count_label[i] << " = " - << kernel_cmp_perf_counters[i].max << std::endl; - } - - powerfile << std::endl << "Kernel Minimum Power Data:" << std::endl; - powerfile << "kernel_min_power = " << kernel_power.min 
<< std::endl; - for (unsigned i = 0; i < num_pwr_cmps; ++i) { - powerfile << "gpu_min_" << pwr_cmp_label[i] << " = " - << kernel_cmp_pwr[i].min << std::endl; - } - for (unsigned i = 0; i < num_perf_counters; ++i) { - powerfile << "gpu_min_" << perf_count_label[i] << " = " - << kernel_cmp_perf_counters[i].min << std::endl; - } - - powerfile << std::endl - << "Accumulative Power Statistics Over Previous Kernels:" - << std::endl; - powerfile << "gpu_tot_avg_power = " - << gpu_tot_power.avg / total_sample_count << std::endl; - powerfile << "gpu_tot_max_power = " << gpu_tot_power.max << std::endl; - powerfile << "gpu_tot_min_power = " << gpu_tot_power.min << std::endl; - powerfile << std::endl << std::endl; - powerfile.flush(); - - if (print_trace) { - print_trace_files(); - } - } -} -void gpgpu_sim_wrapper::dump() { - if (g_power_per_cycle_dump) proc->displayEnergy(2, 5); -} - -void gpgpu_sim_wrapper::print_steady_state(int position, double init_val) { - double temp_avg = sample_val / (double)samples.size(); - double temp_ipc = (init_val - init_inst_val) / - (double)(samples.size() * gpu_stat_sample_freq); - - if ((samples.size() > - gpu_steady_min_period)) { // If steady state occurred for some time, - // print to file - has_written_avg = true; - gzprintf(steady_state_tacking_file, "%u,%d,%f,%f,", sample_start, - total_sample_count, temp_avg, temp_ipc); - for (unsigned i = 0; i < num_perf_counters; ++i) { - gzprintf(steady_state_tacking_file, "%f,", - samples_counter.at(i) / ((double)samples.size())); - } - gzprintf(steady_state_tacking_file, "\n"); - } else { - if (!has_written_avg && position) - gzprintf(steady_state_tacking_file, - "ERROR! 
Not enough steady state points to generate average\n"); - } - - sample_start = 0; - sample_val = 0; - init_inst_val = init_val; - samples.clear(); - samples_counter.clear(); - pwr_counter.clear(); - assert(samples.size() == 0); -} - -void gpgpu_sim_wrapper::detect_print_steady_state(int position, - double init_val) { - // Calculating Average - if (g_power_simulation_enabled && g_steady_power_levels_enabled) { - steady_state_tacking_file = gzopen(g_steady_state_tracking_filename, "a"); - if (position == 0) { - if (samples.size() == 0) { - // First sample - sample_start = total_sample_count; - sample_val = proc->rt_power.readOp.dynamic; - init_inst_val = init_val; - samples.push_back(proc->rt_power.readOp.dynamic); - assert(samples_counter.size() == 0); - assert(pwr_counter.size() == 0); - - for (unsigned i = 0; i < (num_perf_counters); ++i) { - samples_counter.push_back(sample_perf_counters[i]); - } - - for (unsigned i = 0; i < (num_pwr_cmps); ++i) { - pwr_counter.push_back(sample_cmp_pwr[i]); - } - assert(pwr_counter.size() == (double)num_pwr_cmps); - assert(samples_counter.size() == (double)num_perf_counters); - } else { - // Get current average - double temp_avg = sample_val / (double)samples.size(); - - if (abs(proc->rt_power.readOp.dynamic - temp_avg) < - gpu_steady_power_deviation) { // Value is within threshold - sample_val += proc->rt_power.readOp.dynamic; - samples.push_back(proc->rt_power.readOp.dynamic); - for (unsigned i = 0; i < (num_perf_counters); ++i) { - samples_counter.at(i) += sample_perf_counters[i]; - } - - for (unsigned i = 0; i < (num_pwr_cmps); ++i) { - pwr_counter.at(i) += sample_cmp_pwr[i]; - } - - } else { // Value exceeds threshold, not considered steady state - print_steady_state(position, init_val); - } - } - } else { - print_steady_state(position, init_val); - } - gzclose(steady_state_tacking_file); - } -} - -void gpgpu_sim_wrapper::open_files() { - if (g_power_simulation_enabled) { - if (g_power_trace_enabled) { - power_trace_file = 
gzopen(g_power_trace_filename, "a"); - metric_trace_file = gzopen(g_metric_trace_filename, "a"); - } - } -} -void gpgpu_sim_wrapper::close_files() { - if (g_power_simulation_enabled) { - if (g_power_trace_enabled) { - gzclose(power_trace_file); - gzclose(metric_trace_file); - } - } -} diff --git a/version b/version index c832e567c..09e18b115 100644 --- a/version +++ b/version @@ -1 +1 @@ -const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.1.0 "; +const char *g_gpgpusim_version_string = "GPGPU-Sim Simulator Version 4.2.0 "; From b1bb39ee3fad1e689bd0842dca71dc652dd9a30e Mon Sep 17 00:00:00 2001 From: Ni Kang Date: Wed, 27 Oct 2021 14:24:46 -0400 Subject: [PATCH 093/154] Updated the manual --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9bb891659..3e8dc30ab 100644 --- a/README.md +++ b/README.md @@ -268,7 +268,7 @@ To run Pytorch applications with the simulator, install the modified Pytorch lib ## Step 3: Run -Before we run, we need to make sure the application's executable file is dynamically linked to CUDA runtime library. This can be done during compilation of your program by introducing the nvcc flag "--cudart shared" in makefile (quotes should be excluded). +Before we run, we need to make sure the application's executable file is dynamically linked to CUDA runtime library. This can be done during compilation of your program by introducing the nvcc flag "-lcudart" in makefile (quotes should be excluded). 
To confirm the same, type the follwoing command: From 2a29ea9296dd1a39134976c8ad7397f2d36de2f8 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 1 Feb 2022 15:22:08 -0500 Subject: [PATCH 094/154] rm hw_perf.csv from config folder --- configs/tested-cfgs/SM7_QV100/hw_perf.csv | 26 ----------------------- 1 file changed, 26 deletions(-) delete mode 100644 configs/tested-cfgs/SM7_QV100/hw_perf.csv diff --git a/configs/tested-cfgs/SM7_QV100/hw_perf.csv b/configs/tested-cfgs/SM7_QV100/hw_perf.csv deleted file mode 100644 index aa88bb256..000000000 --- a/configs/tested-cfgs/SM7_QV100/hw_perf.csv +++ /dev/null @@ -1,26 +0,0 @@ -Benchmark,Kernel,L1_RH,L1_RM,L1_WH,L1_WM,CC_ACC,SHRD_ACC,DRAM_Rd,DRAM_Wr,L2_RH,L2_RM,L2_WH,L2_WM,NOC,Pipeline_Duty,Num_Idle_SMs,Elapsed_Cycles,Chip Voltage -b+tree-rodinia-3.1,findRangeK,1634256.0,561818.0,40785.0,19032.0,0.0,0.0,259346.0,3524.0,396522.0,259508.0,60000.0,0.0,1343246.0,0.3268163900773488,5.064000000000002,66542.7,1.0 -b+tree-rodinia-3.1,findK,1318908.0,525035.0,42619.0,7404.0,0.0,0.0,255317.0,2582.0,366918.0,255364.0,50000.0,0.0,1250108.0,0.2740918672650619,3.191999999999995,80883.0,1.0 -backprop-rodinia-3.1,_Z22bpnn_layerforward_CUDAPfS_S_S_ii,49152.0,143738.0,192432.0,4232.0,0.0,413696.0,147464.0,60097.0,29059.0,147460.0,196608.0,0.0,704512.0,0.5619432556155418,7.520000000000007,23324.775,1.0 -backprop-rodinia-3.1,_Z24bpnn_adjust_weights_cudaPfiS_iS_S_,465990.0,277805.0,327015.0,887.0,0.0,0.0,286738.0,190646.0,54315.0,286734.0,327686.0,0.0,1263518.0,0.20116733697224465,9.496000000000002,32578.425,1.0 -hotspot-rodinia-3.1,_Z14calculate_tempiPfS_S_iiiiffffff,4250.0,691050.0,0.0,175104.0,0.0,997428.0,262147.0,66263.0,486965.0,262144.0,175104.0,0.0,1732988.0,0.9470499252952201,3.3200000000000074,56438.825,1.0 
-kmeans-rodinia-3.1,_Z11kmeansPointPfiiiPiS_S_S0_,0.0,0.0,0.0,102400.0,4352107.0,0.0,12302960.0,92472.5,6742186.0,12321532.0,102400.0,0.0,26022036.0,0.11420395712434231,1.5799999999999947,894550.775,1.0 -srad_v1-rodinia-3.1,_Z4sradfiilPiS_S_S_PfS0_S0_S0_fS0_S0_,158304.87000000002,89035.40999999999,0.0,143700.0,0.0,0.0,28986.500000000033,45424.200000000026,68135.7,28984.00000000001,143700.0,0.0,481258.2600000001,0.5320091849844065,15.272880000000004,14251.741749999997,1.0 -parboil-sad,_Z11mb_sad_calcPtS_ii,101840.0,415925.0,2102177.0,7289373.0,0.0,10033920.0,257308.0,8720433.0,8754664.0,257280.0,9390720.0,0.0,36398656.0,0.25130932753519797,0.19199999999999662,6551129.125,1.0 -parboil-sgemm,_Z9mysgemmNTPKfiS0_iPfiiff,7109956.0,2452728.0,133388.0,1284.0,0.0,8642304.0,393092.0,36894.0,2059512.0,393088.0,135168.0,0.0,5176696.0,0.5495706862295477,1.8799999999999972,358744.025,1.0 -parboil-mri-q,_Z12ComputeQ_GPUiiPfS_S_S_S_,0.0,163840.0,65184.0,154.0,17617612.5,0.0,164356.0,0.0,0.0,163840.0,65536.0,0.0,458752.0,0.5767256645623982,12.363999999999997,691892.925,1.0 -dct8x8,_Z14CUDAkernel1DCTPfiiiy,0.0,0.0,552.8,32121.9,786431.9999999999,114688.00000000001,32786.0,0.0,16383.999999999998,32767.999999999996,32767.999999999996,0.0,131071.99999999999,0.06091433507559575,7.7799999999999985,24207.632500000003,1.0 -dct8x8,_Z14CUDAkernel2DCTPfS_i,0.0,32768.00000000002,0.0,32768.00000000002,0.0,49152.00000000004,32773.25742574254,0.0,0.0,32768.00000000002,32768.00000000002,0.0,131072.0000000001,0.14345732731755537,30.750257425742568,5822.941584158416,1.0 -binomialOptions,_Z21binomialOptionsKernelv,0.0,0.0,0.0,1024.0,23688.0,16778240.0,640.0,0.0,0.0,0.0,1024.0,0.0,2048.0,0.6457304629145744,1.9519999999999982,1366301.225,1.0 
-fastWalshTransform,_Z15fwtBatch2KernelPfS_i,0.0,1048576.0000000002,774120.4444444445,271536.22222222225,0.0,0.0,1048581.888888889,945003.222222222,0.0,1048576.0000000002,1048576.0000000002,0.0,4194304.000000001,0.0867005928407203,2.574222222222223,120947.73472222223,1.0 -fastWalshTransform,_Z15fwtBatch1KernelPfS_i,0.0,1048576.0,645060.0,403890.6666666666,0.0,3407872.0,1048581.0,950303.3333333333,0.0,1048576.0,1048576.0,0.0,4194304.0,0.3836524328760675,2.621333333333329,149487.8,1.0 -histogram,_Z17histogram64KernelPjP5uint4j,0.0,2097152.0,0.0,34960.0,0.0,4893504.000000001,2097184.2941176468,26959.294117647052,0.0,2097152.0,34960.0,0.0,4264223.999999999,0.3361853461559831,3.706823529411762,146480.14411764703,1.0 -mergeSort,_Z21mergeSortSharedKernelILj1EEvPjS0_S0_S0_j,0.0,1048576.0,0.0,1048576.0,0.0,12976128.0,1048580.0,950169.0,0.0,1048576.0,1048576.0,0.0,4194304.0,0.9137102229423307,1.1600000000000055,439316.525,1.0 -mergeSort,_Z30mergeElementaryIntervalsKernelILj1EEvPjS0_S0_S0_S0_S0_jj,152481.75,1127706.3333333333,439852.24999999994,829969.9166666665,0.0,3670010.1666666665,1056772.0000000002,959704.0833333334,199523.16666666666,1056768.0,1269875.1666666667,0.0,4878632.833333334,0.44812863772322986,1.6420000000000003,157457.05,1.0 -quasirandomGenerator,_Z26quasirandomGeneratorKernelPfjj,0.0,0.0,0.0,393215.9999999999,47616.000000000015,0.0,21.0,294938.38095238095,0.0,0.0,393215.9999999999,0.0,786431.9999999998,0.6109600290450061,17.68266666666667,80626.8130952381,1.0 -quasirandomGenerator,_Z16inverseCNDKernelPfPjj,0.0,0.0,0.0,393215.9999999999,0.0,0.0,5.952380952380952,294941.6666666666,0.0,0.0,393215.9999999999,0.0,786431.9999999998,0.307434624439692,5.790476190476192,58367.4988095238,1.0 -sobolQRNG,_Z15sobolGPU_kerneljjPjPf,172832.0,31976.0,0.0,1250000.0,0.0,1899700.0,405.0,1151641.0,31592.0,400.0,1250000.0,0.0,2563936.0,0.6380044567750587,2.7840000000000042,112087.775,1.0 
-cutlass_perf_test_k1,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,460800.0,0.0,5120.0,160.0,577120.0000000001,412167.99999999994,42.285714285714285,48640.0,412160.0,5120.0,0.0,931840.0,0.24658369358809393,60.32228571428572,139808.59999999998,1.0 
-cutlass_perf_test_k2,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,2097151.9999999995,171796.0,65782.85714285714,255.99999999999994,1464319.9999999998,1081352.2857142857,45.42857142857143,1015808.0000000002,1081344.0,237568.0,0.0,4669440.0,0.38530040572560803,48.440000000000005,228263.9035714286,1.0 
-cutlass_perf_test_k3,_ZN7cutlass4gemm16gemm_kernel_nolbINS0_12GemmMainloopINS0_10GemmTraitsINS0_14WmmaGemmConfigILNS_12MatrixLayout4KindE1ELS6_1ENS_5ShapeILi64ELi128ELi128ELi1EEE6__halfS9_ffNS7_ILi64ELi32ELi64ELi1EEENS7_ILi16ELi16ELi16ELi1EEELi8ELi8ELi8ELi8ELi4ELi4ELi4EEENS0_16GlobalLoadStreamILNS_11GemmOperand4KindE0ENS0_20GemmGlobalIteratorAbINS0_20GemmGlobalTileTraitsILSF_0ELS6_1EKS9_NS7_ILi1ELi64ELi128ELi1EEENS7_ILi1ELi8ELi32ELi1EEELi8EEEiEENS_17TileStoreIteratorINS0_27GemmSharedStoreTileAbTraitsIS9_NS7_ILi1ELi64ELi136ELi1EEENS7_ILi1ELi16ELi16ELi1EEELi8EEES9_LNS_15IteratorAdvance4KindE1ELNS_11MemorySpace4KindE1EiS9_LNS_19FragmentElementType4KindE0ENS7_ILi0ELi0ELi0ELi0EEEEENS_4CopyINS_8FragmentIS9_Li32ELm16EEEEEEENSD_ILSF_1ENSG_INSH_ILSF_1ELS6_1ESI_NS7_ILi1ELi128ELi64ELi1EEENS7_ILi1ELi4ELi64ELi1EEELi8EEEiEENSN_INSO_IS9_NS7_ILi1ELi128ELi72ELi1EEENS7_ILi1ELi32ELi8ELi1EEELi8EEES9_LST_1ELSV_1EiS9_LSX_0ESY_EES13_EENS0_16SharedLoadStreamINS_16TileLoadIteratorINS0_29WmmaGemmSharedLoadTileATraitsILS6_1ES9_SP_NS7_ILi1ELi4ELi2ELi1EEELi16ENS7_ILi1ELi1ELi4ELi1EEENS7_ILi2176ELi0ELi32ELi0EEESB_EES9_LST_1ELSV_1EiNS_10WmmaMatrixILSF_0ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1M_Li4ELm16EEEEEEENS1E_INS1F_INS0_29WmmaGemmSharedLoadTileBTraitsILS6_1ES9_S19_S1H_Li1152ENS7_ILi1ELi1ELi2ELi1EEENS7_ILi16ELi0ELi4608ELi1EEESB_EES9_LST_1ELSV_1EiNS1L_ILSF_1ELS6_1ES9_SB_EELSX_1ESY_EENS10_INS11_IS1V_Li2ELm16EEEEEEENS0_12GemmEpilogueINS0_28SimplifiedGemmEpilogueTraitsISC_NS0_13LinearScalingIfNS0_19FragmentMultiplyAddIffLb1EEEEEiNS0_28WmmaGemmEpilogueTraitsHelperISC_fS25_iEEEEEENS0_20IdentityBlockSwizzleEiNS0_17ClearAccumulatorsIfLi1EEEEEEEEEvNT_6ParamsE,0.0,3276800.0000000005,429682.85714285716,164204.57142857142,640.0,2309120.0,491527.9999999999,77869.28571428571,2785279.9999999995,491519.99999999994,593920.0000000001,0.0,7741440.0,0.8525726478636384,1.832,161781.07857142857,1.0 
-cudaTensorCoreGemm,_Z12compute_gemmPK6__halfS1_PKfPfff,0.0,69206016.0,0.0,2097152.0,0.0,30146560.0,16974052.0,1998866.0,52232060.0,16973824.0,2097152.0,0.0,142606336.0,0.7380984268363922,1.264000000000003,3871172.375,1.0 From 011891abfae060903f76d5c7aee23208cd295e71 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Tue, 1 Feb 2022 20:36:24 -0500 Subject: [PATCH 095/154] Update Copyrights --- src/abstract_hardware_model.cc | 5 +++-- src/abstract_hardware_model.h | 5 +++-- src/cuda-sim/cuda-sim.cc | 5 +++-- src/cuda-sim/instructions.cc | 5 +++-- src/cuda-sim/ptx.l | 5 +++-- src/cuda-sim/ptx_ir.cc | 5 +++-- src/gpgpu-sim/dram.cc | 5 +++-- src/gpgpu-sim/dram.h | 5 +++-- src/gpgpu-sim/gpu-cache.cc | 6 ++++-- src/gpgpu-sim/gpu-cache.h | 5 +++-- src/gpgpu-sim/gpu-sim.cc | 5 +++-- src/gpgpu-sim/gpu-sim.h | 3 ++- src/gpgpu-sim/l2cache.cc | 5 +++-- src/gpgpu-sim/l2cache.h | 5 +++-- src/gpgpu-sim/power_interface.cc | 5 +++-- src/gpgpu-sim/power_interface.h | 5 +++-- src/gpgpu-sim/power_stat.cc | 5 +++-- src/gpgpu-sim/power_stat.h | 5 +++-- src/gpgpu-sim/shader.cc | 5 +++-- src/gpgpu-sim/shader.h | 5 +++-- 20 files changed, 60 insertions(+), 39 deletions(-) diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 208047eeb..fda84e8b0 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -1,5 +1,6 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index f04741f75..e9d7c76a4 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1,5 +1,6 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index f9e5db314..680ce7970 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -1,6 +1,7 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan, Jimmy Kwa, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// George L. Yuan, Jimmy Kwa, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 44afbe5aa..e22d88a81 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -1,6 +1,7 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Jimmy Kwa, George L. Yuan, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Jimmy Kwa, George L. Yuan, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. 
Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/cuda-sim/ptx.l b/src/cuda-sim/ptx.l index 7706f0b31..15b3cf77e 100644 --- a/src/cuda-sim/ptx.l +++ b/src/cuda-sim/ptx.l @@ -1,6 +1,7 @@ /* -Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas -The University of British Columbia, Northwestern University +Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas, +Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +The University of British Columbia, Northwestern University, Purdue University All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index 2edc1ed56..029cf73a8 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -1,6 +1,7 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// George L. Yuan, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/dram.cc b/src/gpgpu-sim/dram.cc index 545c45dfd..662c2ed3f 100644 --- a/src/gpgpu-sim/dram.cc +++ b/src/gpgpu-sim/dram.cc @@ -1,6 +1,7 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Ivan Sham, George L. Yuan, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Ivan Sham, George L. Yuan, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. 
Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/dram.h b/src/gpgpu-sim/dram.h index 88e46ed7b..90ea3e40e 100644 --- a/src/gpgpu-sim/dram.h +++ b/src/gpgpu-sim/dram.h @@ -1,6 +1,7 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Ivan Sham, Ali Bakhoda, -// George L. Yuan, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// George L. Yuan, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index a2aeec57f..3a5a67dfa 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1,5 +1,7 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, +// Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, +// Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 498dfebd0..4bbf7e2b3 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1,5 +1,6 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Copyright (c) 2009-2021, Tor M. 
Aamodt, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index e44551ee3..ee243c1f3 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1,6 +1,7 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, George L. Yuan, -// Ali Bakhoda, Andrew Turner, Ivan Sham, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Ali Bakhoda, Andrew Turner, Ivan Sham, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 68b3dfa10..de69ef8ce 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -1,5 +1,6 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 511c15efa..44d793cbc 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -1,5 +1,6 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. 
Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 902a4b7c0..7fa1f2917 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -1,5 +1,6 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/power_interface.cc b/src/gpgpu-sim/power_interface.cc index 63b985260..470f2f9cf 100644 --- a/src/gpgpu-sim/power_interface.cc +++ b/src/gpgpu-sim/power_interface.cc @@ -1,5 +1,6 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/power_interface.h b/src/gpgpu-sim/power_interface.h index 1a488948c..1c6c51068 100644 --- a/src/gpgpu-sim/power_interface.h +++ b/src/gpgpu-sim/power_interface.h @@ -1,5 +1,6 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Copyright (c) 2009-2021, Tor M. 
Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/power_stat.cc b/src/gpgpu-sim/power_stat.cc index fd7a77560..d0e673cb3 100644 --- a/src/gpgpu-sim/power_stat.cc +++ b/src/gpgpu-sim/power_stat.cc @@ -1,5 +1,6 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/power_stat.h b/src/gpgpu-sim/power_stat.h index e2c3ed5cc..d40f1d98a 100644 --- a/src/gpgpu-sim/power_stat.h +++ b/src/gpgpu-sim/power_stat.h @@ -1,5 +1,6 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c0161dd31..9f8a1297e 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1,6 +1,7 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// George L. 
Yuan, Andrew Turner, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// George L. Yuan, Andrew Turner, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 65d56251c..d80476ff4 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1,6 +1,7 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Andrew Turner, -// Ali Bakhoda, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University +// Ali Bakhoda, Vijay Kandiah, Nikos Hardavellas, +// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers +// The University of British Columbia, Northwestern University, Purdue University // All rights reserved. 
// // Redistribution and use in source and binary forms, with or without From f0ad71cfdfe4675b02a948c7d3547da0ddce60e9 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Tue, 15 Feb 2022 08:34:16 -0500 Subject: [PATCH 096/154] set default max concurrent ctas to 32 and validate --- src/abstract_hardware_model.h | 1 + src/gpgpu-sim/gpu-sim.cc | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index e9d7c76a4..6e4a87dac 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -291,6 +291,7 @@ class kernel_info_t { m_next_tid.x < m_block_dim.x; } unsigned get_uid() const { return m_uid; } + std::string get_name() const { return name(); } std::string name() const; std::list &active_threads() { diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index ee243c1f3..8284ad359 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -395,7 +395,7 @@ void shader_core_config::reg_options(class OptionParser *opp) { "gpgpu_ignore_resources_limitation (default 0)", "0"); option_parser_register( opp, "-gpgpu_shader_cta", OPT_UINT32, &max_cta_per_core, - "Maximum number of concurrent CTAs in shader (default 8)", "8"); + "Maximum number of concurrent CTAs in shader (default 32)", "32"); option_parser_register( opp, "-gpgpu_num_cta_barriers", OPT_UINT32, &max_barriers_per_cta, "Maximum number of named barriers per CTA (default 16)", "16"); @@ -1639,9 +1639,9 @@ bool shader_core_ctx::occupy_shader_resource_1block(kernel_info_t &k, SHADER_DPRINTF(LIVENESS, "GPGPU-Sim uArch: Occupied %u threads, %u shared mem, %u " - "registers, %u ctas\n", + "registers, %u ctas, on shader %d\n", m_occupied_n_threads, m_occupied_shmem, m_occupied_regs, - m_occupied_ctas); + m_occupied_ctas, m_sid); } return true; @@ -1807,9 +1807,9 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { shader_CTA_count_log(m_sid, 1); SHADER_DPRINTF(LIVENESS, "GPGPU-Sim 
uArch: cta:%2u, start_tid:%4u, end_tid:%4u, " - "initialized @(%lld,%lld)\n", + "initialized @(%lld,%lld), kernel_uid:%u, kernel_name:%s\n", free_cta_hw_id, start_thread, end_thread, m_gpu->gpu_sim_cycle, - m_gpu->gpu_tot_sim_cycle); + m_gpu->gpu_tot_sim_cycle, kernel.get_uid(), kernel.get_name().c_str()); } /////////////////////////////////////////////////////////////////////////////////////////// From 43198e9c34c4ac2c215c90f9b9768b737b23e429 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Fri, 18 Feb 2022 00:49:47 -0500 Subject: [PATCH 097/154] fix trace-driven concurrency segfault --- src/gpgpu-sim/shader.cc | 4 +++- src/gpgpu-sim/shader.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 9f8a1297e..814311d1c 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -951,7 +951,7 @@ void shader_core_ctx::fetch() { m_threadState[tid].m_active = false; unsigned cta_id = m_warp[warp_id]->get_cta_id(); if (m_thread[tid] == NULL) { - register_cta_thread_exit(cta_id, m_kernel); + register_cta_thread_exit(cta_id, m_warp[warp_id]->get_kernel_info()); } else { register_cta_thread_exit(cta_id, &(m_thread[tid]->get_kernel())); @@ -3898,6 +3898,8 @@ void shader_core_ctx::get_icnt_power_stats(long &n_simt_to_mem, n_mem_to_simt += m_stats->n_mem_to_simt[m_sid]; } +kernel_info_t* shd_warp_t::get_kernel_info() const { return m_shader->get_kernel_info(); } + bool shd_warp_t::functional_done() const { return get_n_completed() == m_warp_size; } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index d80476ff4..c3e6f93ed 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -171,6 +171,7 @@ class shd_warp_t { void clear_membar() { m_membar = false; } bool get_membar() const { return m_membar; } virtual address_type get_pc() const { return m_next_pc; } + virtual kernel_info_t* get_kernel_info() const; void set_next_pc(address_type pc) { m_next_pc = pc; } void 
store_info_of_last_inst_at_barrier(const warp_inst_t *pI) { From 8f71be8754506aa37ad905c2958915a1256375c3 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Fri, 18 Feb 2022 10:14:39 -0500 Subject: [PATCH 098/154] update max_concurrent kernel based on compute capability --- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 1 + src/gpgpu-sim/gpu-sim.cc | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 76c99b7d6..8d2b10199 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -50,6 +50,7 @@ -gpgpu_runtime_pending_launch_count_limit 2048 -gpgpu_kernel_launch_latency 5000 -gpgpu_TB_launch_latency 0 +-gpgpu_max_concurrent_kernel 128 # Compute Capability -gpgpu_compute_capability_major 7 diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 8284ad359..5af244b33 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -701,7 +701,8 @@ void gpgpu_sim_config::reg_options(option_parser_t opp) { "500.0:2000.0:2000.0:2000.0"); option_parser_register( opp, "-gpgpu_max_concurrent_kernel", OPT_INT32, &max_concurrent_kernel, - "maximum kernels that can run concurrently on GPU", "8"); + "maximum kernels that can run concurrently on GPU, set this value " + "according to max resident grids for your compute capability", "32"); option_parser_register( opp, "-gpgpu_cflog_interval", OPT_INT32, &gpgpu_cflog_interval, "Interval between each snapshot in control flow logger", "0"); From 8ba79fb9708437aef8b34ca6b054ac17cdd88440 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Fri, 18 Feb 2022 10:19:00 -0500 Subject: [PATCH 099/154] update configs max concurrent kernel based on compute capability --- configs/tested-cfgs/SM75_RTX2060/gpgpusim.config | 1 + configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config | 1 + configs/tested-cfgs/SM7_TITANV/gpgpusim.config | 1 + 
configs/tested-cfgs/SM86_RTX3070/gpgpusim.config | 1 + 4 files changed, 4 insertions(+) diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 2a9bff015..158b97e17 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -10,6 +10,7 @@ -gpgpu_runtime_pending_launch_count_limit 2048 -gpgpu_kernel_launch_latency 5000 -gpgpu_TB_launch_latency 0 +-gpgpu_max_concurrent_kernel 128 # Compute Capability -gpgpu_compute_capability_major 7 diff --git a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config index 0fb4742e1..89435a919 100644 --- a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config @@ -43,6 +43,7 @@ -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 -gpgpu_kernel_launch_latency 5000 +-gpgpu_max_concurrent_kernel 128 # Compute Capability -gpgpu_compute_capability_major 7 diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index 5c6be224a..af561de59 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -18,6 +18,7 @@ -gpgpu_heap_size_limit 8388608 -gpgpu_runtime_sync_depth_limit 2 -gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_max_concurrent_kernel 128 # Compute Capability -gpgpu_compute_capability_major 7 diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index 854378151..aee01308d 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -10,6 +10,7 @@ -gpgpu_runtime_pending_launch_count_limit 2048 -gpgpu_kernel_launch_latency 5000 -gpgpu_TB_launch_latency 0 +-gpgpu_max_concurrent_kernel 128 # Compute Capability -gpgpu_compute_capability_major 8 From 
da6a16a990a007edb7a760a2eb5b9b48ccc06e4c Mon Sep 17 00:00:00 2001 From: Rodrigo Huerta Date: Fri, 25 Feb 2022 10:33:46 +0000 Subject: [PATCH 100/154] Fixed old bug that happens when there are different latencies to the same execution unit --- src/gpgpu-sim/shader.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index c6e7b8f67..90bb900b2 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -2294,8 +2294,10 @@ void pipelined_simd_unit::cycle() { if (!m_dispatch_reg->dispatch_delay()) { int start_stage = m_dispatch_reg->latency - m_dispatch_reg->initiation_interval; - move_warp(m_pipeline_reg[start_stage], m_dispatch_reg); - active_insts_in_pipeline++; + if(m_pipeline_reg[start_stage]->empty()) { + move_warp(m_pipeline_reg[start_stage], m_dispatch_reg); + active_insts_in_pipeline++; + } } } occupied >>= 1; From 92f313b60550ba0496d004f1a99647b1411c8da3 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Wed, 23 Mar 2022 11:43:04 -0400 Subject: [PATCH 101/154] fix sub-core operand collector dispatch rr --- src/gpgpu-sim/shader.h | 44 ++++++++++-------------------------------- 1 file changed, 10 insertions(+), 34 deletions(-) diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index c3e6f93ed..deea1c93a 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -956,41 +956,19 @@ class opndcoll_rfu_t { // operand collector based register file unit void init(bool sub_core_model, unsigned num_warp_scheds) { m_sub_core_model = sub_core_model; m_num_warp_scheds = num_warp_scheds; - if (m_sub_core_model) { - m_last_cu_set = new unsigned(m_num_warp_scheds); - for (unsigned i = 0; i < m_num_warp_scheds; i++) - { - m_last_cu_set[i] = i * m_num_collectors / m_num_warp_scheds; - } - } - } collector_unit_t *find_ready() { - if (m_sub_core_model) { - assert(m_num_collectors % m_num_warp_scheds == 0 && - m_num_collectors >= m_num_warp_scheds); - unsigned cusPerSched = 
m_num_collectors / m_num_warp_scheds; - for (unsigned i = 0; i < m_num_warp_scheds; i++) { - unsigned cuLowerBound = i * cusPerSched; - unsigned cuUpperBound = cuLowerBound + cusPerSched; - assert(0 <= cuLowerBound && cuUpperBound <= m_num_collectors); - assert(cuLowerBound <= m_last_cu_set[i] && m_last_cu_set[i] <= cuUpperBound); - for (unsigned j = cuLowerBound; j < cuUpperBound; j++) { - unsigned c = cuLowerBound + (m_last_cu_set[i] + j + 1) % cusPerSched; - if ((*m_collector_units)[c].ready()) { - m_last_cu_set[i] = c; - return &((*m_collector_units)[c]); - } - } - } - } else { - for (unsigned n = 0; n < m_num_collectors; n++) { - unsigned c = (m_last_cu + n + 1) % m_num_collectors; - if ((*m_collector_units)[c].ready()) { - m_last_cu = c; - return &((*m_collector_units)[c]); - } + // With sub-core enabled round robin starts with the next cu assigned to a + // different sub-core than the one that dispatched last + unsigned cusPerSched = m_num_collectors / m_num_warp_scheds; + unsigned rr_increment = m_sub_core_model ? + cusPerSched - (m_last_cu % cusPerSched) : 1; + for (unsigned n = 0; n < m_num_collectors; n++) { + unsigned c = (m_last_cu + n + rr_increment) % m_num_collectors; + if ((*m_collector_units)[c].ready()) { + m_last_cu = c; + return &((*m_collector_units)[c]); } } return NULL; @@ -1000,9 +978,7 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_collectors; std::vector *m_collector_units; unsigned m_last_cu; // dispatch ready cu's rr - unsigned *m_last_cu_set; unsigned m_next_cu; // for initialization - bool m_sub_core_model; unsigned m_num_warp_scheds; }; From ee9b6268818053daafaabdd31cdc4c5b95cbcb3d Mon Sep 17 00:00:00 2001 From: FJShen <50934207+FJShen@users.noreply.github.com> Date: Tue, 29 Mar 2022 11:05:00 -0400 Subject: [PATCH 102/154] Update shader.h This is a relatively critical bug comparing to other memory errors that deserves early merging. 
--- src/gpgpu-sim/shader.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index c3e6f93ed..b447e284f 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -957,7 +957,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_sub_core_model = sub_core_model; m_num_warp_scheds = num_warp_scheds; if (m_sub_core_model) { - m_last_cu_set = new unsigned(m_num_warp_scheds); + m_last_cu_set = new unsigned[m_num_warp_scheds]; for (unsigned i = 0; i < m_num_warp_scheds; i++) { m_last_cu_set[i] = i * m_num_collectors / m_num_warp_scheds; From c9cc0a013e867a4df04e674a86b7fd91c0116eab Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Wed, 6 Apr 2022 10:14:13 -0400 Subject: [PATCH 103/154] fix duplicate regfile accesses within same instruction --- src/gpgpu-sim/shader.cc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 814311d1c..ad9278c8f 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -4241,12 +4241,19 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, warp_inst_t **pipeline_reg = pipeline_reg_set->get_ready(); if ((pipeline_reg) and !((*pipeline_reg)->empty())) { m_warp_id = (*pipeline_reg)->warp_id(); + std::vector prev_regs; // remove duplicate regs within same instr for (unsigned op = 0; op < MAX_REG_OPERANDS; op++) { int reg_num = (*pipeline_reg) ->arch_reg.src[op]; // this math needs to match that used in // function_info::ptx_decode_inst - if (reg_num >= 0) { // valid register + bool new_reg = true; + for (auto r : prev_regs) { + if (r == reg_num) + new_reg = false; + } + if (reg_num >= 0 && new_reg) { // valid register + prev_regs.push_back(reg_num); m_src_op[op] = op_t(this, op, reg_num, m_num_banks, m_bank_warp_shift, m_sub_core_model, m_num_banks_per_sched, (*pipeline_reg)->get_schd_id()); From 33ad7c89b43cedc11dd645f2ae0215e6e4531230 
Mon Sep 17 00:00:00 2001 From: notseefire <370797515@qq.com> Date: Thu, 7 Apr 2022 23:29:09 +0800 Subject: [PATCH 104/154] Fixed constant_cycle --- src/gpgpu-sim/shader.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 814311d1c..ab84fc578 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -2092,10 +2092,11 @@ bool ldst_unit::constant_cycle(warp_inst_t &inst, mem_stage_stall_type &rc_fail, mem_stage_stall_type fail; if (m_config->perfect_inst_const_cache) { fail = NO_RC_FAIL; + unsigned access_count = inst.accessq_count(); while (inst.accessq_count() > 0) inst.accessq_pop_back(); if (inst.is_load()) { for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++) - if (inst.out[r] > 0) m_pending_writes[inst.warp_id()][inst.out[r]]--; + if (inst.out[r] > 0) m_pending_writes[inst.warp_id()][inst.out[r]] -= access_count; } } else { fail = process_memory_access_queue(m_L1C, inst); From b102f6139cfb509a2f9eed149cea89b6da3caa0d Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 5 Dec 2022 00:54:59 +0100 Subject: [PATCH 105/154] Fixed regex for files generated by newer bison versions --- src/cuda-sim/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda-sim/Makefile b/src/cuda-sim/Makefile index 85d1c8c01..88e6f2b79 100644 --- a/src/cuda-sim/Makefile +++ b/src/cuda-sim/Makefile @@ -129,9 +129,9 @@ $(OUTPUT_DIR)/instructions.h: instructions.cc $(OUTPUT_DIR)/ptx_parser_decode.def: $(OUTPUT_DIR)/ptx.tab.c ifeq ($(shell uname),Linux) - cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed 's/^[ ]\+//' | sed 's/[=,]//g' | sed 's/\([_A-Z1-9]\+\)[ ]\+\([0-9]\+\)/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' > $(OUTPUT_DIR)/ptx_parser_decode.def + cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed 's/^[ ]\+//' | sed -E 's/\/\*.+\*\///' | sed 's/[=,]//g' | sed 's/\([_A-Z1-9]\+\)[ ]\+\([0-9]\+\)/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' > 
$(OUTPUT_DIR)/ptx_parser_decode.def else - cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed -E 's/^ +//' | sed 's/[=,]//g' | sed -E 's/([_A-Z1-9]+).*/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' > $(OUTPUT_DIR)/ptx_parser_decode.def + cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed -E 's/^ +//' | sed -E 's/\/\*.+\*\///' | sed 's/[=,]//g' | sed -E 's/([_A-Z1-9]+).*/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' > $(OUTPUT_DIR)/ptx_parser_decode.def endif $(OUTPUT_DIR)/instructions.o: $(OUTPUT_DIR)/instructions.h $(OUTPUT_DIR)/ptx.tab.c From 0d8af96f37753f1624fec5c8e16fa9d4496929b2 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 5 Dec 2022 01:09:47 +0100 Subject: [PATCH 106/154] Ignore YY keywords, remove whitespace --- src/cuda-sim/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cuda-sim/Makefile b/src/cuda-sim/Makefile index 88e6f2b79..8430eb1ab 100644 --- a/src/cuda-sim/Makefile +++ b/src/cuda-sim/Makefile @@ -129,9 +129,9 @@ $(OUTPUT_DIR)/instructions.h: instructions.cc $(OUTPUT_DIR)/ptx_parser_decode.def: $(OUTPUT_DIR)/ptx.tab.c ifeq ($(shell uname),Linux) - cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed 's/^[ ]\+//' | sed -E 's/\/\*.+\*\///' | sed 's/[=,]//g' | sed 's/\([_A-Z1-9]\+\)[ ]\+\([0-9]\+\)/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' > $(OUTPUT_DIR)/ptx_parser_decode.def + cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed 's/^[ ]\+//' | sed -E 's/\s+\/\*.+\*\///' | sed 's/[=,]//g' | sed 's/\([_A-Z1-9]\+\)[ ]\+\([0-9]\+\)/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' | sed '/YYerror/d;/YYEOF/d;/YYEMPTY/d;/YYUNDEF/d;'> $(OUTPUT_DIR)/ptx_parser_decode.def else - cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed -E 's/^ +//' | sed -E 's/\/\*.+\*\///' | sed 's/[=,]//g' | sed -E 's/([_A-Z1-9]+).*/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' > $(OUTPUT_DIR)/ptx_parser_decode.def + cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed -E 's/^ +//' | sed -E 
's/\s+\/\*.+\*\///' | sed 's/[=,]//g' | sed -E 's/([_A-Z1-9]+).*/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' > $(OUTPUT_DIR)/ptx_parser_decode.def endif $(OUTPUT_DIR)/instructions.o: $(OUTPUT_DIR)/instructions.h $(OUTPUT_DIR)/ptx.tab.c From b22122ab03365b0570a034b95876df4f83295c78 Mon Sep 17 00:00:00 2001 From: Dimitris Papagiannis Date: Mon, 5 Dec 2022 01:19:27 +0100 Subject: [PATCH 107/154] Added regex for non-linux platforms --- src/cuda-sim/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cuda-sim/Makefile b/src/cuda-sim/Makefile index 8430eb1ab..01bc4807f 100644 --- a/src/cuda-sim/Makefile +++ b/src/cuda-sim/Makefile @@ -131,7 +131,7 @@ $(OUTPUT_DIR)/ptx_parser_decode.def: $(OUTPUT_DIR)/ptx.tab.c ifeq ($(shell uname),Linux) cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed 's/^[ ]\+//' | sed -E 's/\s+\/\*.+\*\///' | sed 's/[=,]//g' | sed 's/\([_A-Z1-9]\+\)[ ]\+\([0-9]\+\)/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' | sed '/YYerror/d;/YYEOF/d;/YYEMPTY/d;/YYUNDEF/d;'> $(OUTPUT_DIR)/ptx_parser_decode.def else - cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed -E 's/^ +//' | sed -E 's/\s+\/\*.+\*\///' | sed 's/[=,]//g' | sed -E 's/([_A-Z1-9]+).*/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' > $(OUTPUT_DIR)/ptx_parser_decode.def + cat $(OUTPUT_DIR)/ptx.tab.h | grep "=" | sed -E 's/^ +//' | sed -E 's/\s+\/\*.+\*\///' | sed 's/[=,]//g' | sed -E 's/([_A-Z1-9]+).*/\1 \1/' | sed 's/^/DEF(/' | sed 's/ /,"/' | sed 's/$$/")/' | sed '/YYerror/d;/YYEOF/d;/YYEMPTY/d;/YYUNDEF/d;' > $(OUTPUT_DIR)/ptx_parser_decode.def endif $(OUTPUT_DIR)/instructions.o: $(OUTPUT_DIR)/instructions.h $(OUTPUT_DIR)/ptx.tab.c From f5d21b116f927a9c8e2630a26ada045dcc844242 Mon Sep 17 00:00:00 2001 From: tgrogers Date: Fri, 10 Feb 2023 10:01:51 -0500 Subject: [PATCH 108/154] Taking out a comment that is no longer relevant and truncating the commit hash to be more sane --- setup_environment | 37 +++++++++++++++++++++---------------- 1 file 
changed, 21 insertions(+), 16 deletions(-) diff --git a/setup_environment b/setup_environment index d3ff8403c..7eeaa4f12 100644 --- a/setup_environment +++ b/setup_environment @@ -7,7 +7,7 @@ export GPGPUSIM_ROOT="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )" GPGPUSIM_VERSION_STRING=`cat $GPGPUSIM_ROOT/version | awk '/Version/ {print $8}'` #Detect Git branch and commit # -GIT_COMMIT=`git --git-dir=$GPGPUSIM_ROOT/.git log -n 1 | head -1 | sed -re 's/commit (.*)/\1/'` +GIT_COMMIT=`git --git-dir=$GPGPUSIM_ROOT/.git log --abbrev-commit -n 1 | head -1 | sed -re 's/commit (.*)/\1/'` GIT_FILES_CHANGED=`git --git-dir=$GPGPUSIM_ROOT/.git diff --numstat | wc | sed -re 's/^\s+([0-9]+).*/\1./'` GIT_FILES_CHANGED+=`git --git-dir=$GPGPUSIM_ROOT/.git diff --numstat --cached | wc | sed -re 's/^\s+([0-9]+).*/\1/'` GPGPUSIM_BUILD_STRING="gpgpu-sim_git-commit-$GIT_COMMIT-modified_$GIT_FILES_CHANGED" @@ -143,21 +143,26 @@ fi if [ -z "$PTXAS_CUDA_INSTALL_PATH" ]; then export PTXAS_CUDA_INSTALL_PATH=$CUDA_INSTALL_PATH; fi -echo ""; -echo "----------------------------------------------------------------------------"; -echo "INFO - If you only care about PTX execution, ignore this message. GPGPU-Sim supports PTX execution in modern CUDA." -echo "If you want to run PTXPLUS (sm_1x SASS) with a modern card configuration - set the envronment variable" -echo "\$PTXAS_CUDA_INSTALL_PATH to point a CUDA version compabible with your card configurations (i.e. 
8+ for PASCAL, 9+ for VOLTA etc..)" -echo "For example: \"export \$PTXAS_CUDA_INSTALL_PATH=/usr/local/cuda-9.1\"" -echo "" -echo "The following text describes why:"; -echo "If you are using PTXPLUS, only sm_1x is supported and it requires that the app and simulator binaries are compiled in CUDA 4.2 or less."; -echo "The simulator requires it since CUDA headers desribe struct sizes in the exec which change from gen to gen."; -echo "The apps require 4.2 because new versions of CUDA tools have dropped parsing support for generating sm_1x"; -echo "When running using modern config (i.e. volta) and PTXPLUS with CUDA 4.2, the \$PTXAS_CUDA_INSTALL_PATH env variable is required to get proper register usage" -echo "(and hence occupancy) using a version of CUDA that knows the register usage on the real card." -echo ""; -echo "----------------------------------------------------------------------------"; + +# I am not sure PTXPlus really makes sense anymore and this verbose print to describe +# how to use it is probably not aging well. The info in here is good though if you care +# about PTXPlus, so I will leave it as a comment. +# +#echo ""; +#echo "----------------------------------------------------------------------------"; +#echo "INFO - If you only care about PTX execution or trace-based SASS execution, ignore this message." +#echo "If you want to run PTXPLUS (sm_1x SASS) with a modern card configuration - set the envronment variable" +#echo "\$PTXAS_CUDA_INSTALL_PATH to point a CUDA version compabible with your card configurations (i.e. 
8+ for PASCAL, 9+ for VOLTA etc..)" +#echo "For example: \"export \$PTXAS_CUDA_INSTALL_PATH=/usr/local/cuda-9.1\"" +#echo "" +#echo "The following text describes why:"; +#echo "If you are using PTXPLUS, only sm_1x is supported and it requires that the app and simulator binaries are compiled in CUDA 4.2 or less."; +#echo "The simulator requires it since CUDA headers desribe struct sizes in the exec which change from gen to gen."; +#echo "The apps require 4.2 because new versions of CUDA tools have dropped parsing support for generating sm_1x"; +#echo "When running using modern config (i.e. volta) and PTXPLUS with CUDA 4.2, the \$PTXAS_CUDA_INSTALL_PATH env variable is required to get proper register usage" +#echo "(and hence occupancy) using a version of CUDA that knows the register usage on the real card." +#echo ""; +#echo "----------------------------------------------------------------------------"; echo "setup_environment succeeded"; From e3483ab6cd79a721e60c403dc4ebadb38ba24391 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Sat, 11 Feb 2023 13:33:51 -0500 Subject: [PATCH 109/154] truncate the commit hash --- version_detection.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version_detection.mk b/version_detection.mk index ee71a6240..81c1d2ae7 100644 --- a/version_detection.mk +++ b/version_detection.mk @@ -32,7 +32,7 @@ else GPGPUSIM_VERSION=$(shell cat $(GPGPUSIM_ROOT)/version | awk '/Version/ {print $$8}' ) #Detect Git branch and commit # -GIT_COMMIT := $(shell git log -n 1 | head -1 | sed -re 's/commit (.*)/\1/') +GIT_COMMIT := $(shell git log --abbrev-commit -n 1 | head -1 | sed -re 's/commit (.*)/\1/') GIT_FILES_CHANGED_A:=$(shell git diff --numstat | wc | sed -re 's/^\s+([0-9]+).*/\1./') GIT_FILES_CHANGED:= $(GIT_FILES_CHANGED_A)$(shell git diff --numstat --cached | wc | sed -re 's/^\s+([0-9]+).*/\1/') GPGPUSIM_BUILD := gpgpu-sim_git-commit-$(GIT_COMMIT)_modified_$(GIT_FILES_CHANGED) From 
5cfcc39fa877ceef377b238407f9598f0ec9be3c Mon Sep 17 00:00:00 2001 From: tgrogers Date: Sat, 11 Feb 2023 16:56:19 -0500 Subject: [PATCH 110/154] Getting rid of some busted old email templates --- Jenkinsfile | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a3db3e503..f6676bf14 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -90,20 +90,17 @@ pipeline { } post { success { -// sh 'git remote rm upstream' - emailext body:'''${SCRIPT, template="groovy-html.success.template"}''', + emailext body: "See ${BUILD_URL}.", recipientProviders: [[$class: 'CulpritsRecipientProvider'], [$class: 'RequesterRecipientProvider']], - subject: "[AALP Jenkins] Build #${BUILD_NUMBER} - Success!", - attachmentsPattern: 'correl.*.txt', + subject: "[AALP Jenkins] Build ${JOB_NAME} #${BUILD_NUMBER} - Success!", to: 'tgrogers@purdue.edu' } failure { -// sh 'git remote rm upstream' emailext body: "See ${BUILD_URL}", recipientProviders: [[$class: 'CulpritsRecipientProvider'], [$class: 'RequesterRecipientProvider']], - subject: "[AALP Jenkins] Build #${BUILD_NUMBER} - ${currentBuild.result}", + subject: "[AALP Jenkins] Build ${JOB_NAME} #${BUILD_NUMBER} - ${currentBuild.result}", to: 'tgrogers@purdue.edu' } } From 46e0ec221496469920fbabb91efd447f74da702f Mon Sep 17 00:00:00 2001 From: christindbose Date: Wed, 10 May 2023 14:15:37 -0400 Subject: [PATCH 111/154] Ported aerialvision to use python3 instead of python2 --- .../__pycache__/configs.cpython-310.pyc | Bin 0 -> 1444 bytes .../__pycache__/guiclasses.cpython-310.pyc | Bin 0 -> 69744 bytes .../__pycache__/lexyacc.cpython-310.pyc | Bin 0 -> 6623 bytes .../lexyaccbookmark.cpython-310.pyc | Bin 0 -> 2712 bytes .../lexyacctexteditor.cpython-310.pyc | Bin 0 -> 2825 bytes .../__pycache__/organizedata.cpython-310.pyc | Bin 0 -> 6305 bytes .../__pycache__/parsetab.cpython-310.pyc | Bin 0 -> 870 bytes .../__pycache__/startup.cpython-310.pyc | Bin 0 -> 24043 bytes .../variableclasses.cpython-310.pyc 
| Bin 0 -> 4892 bytes aerialvision/configs.py | 11 +-- aerialvision/guiclasses.py | 85 +++++++++--------- aerialvision/lexyacc.py | 31 ++++--- aerialvision/lexyaccbookmark.py | 9 +- aerialvision/lexyacctexteditor.py | 11 +-- aerialvision/organizedata.py | 35 ++++---- aerialvision/parser.out | 47 ++++++++++ aerialvision/parsetab.py | 31 +++++++ aerialvision/startup.py | 18 ++-- aerialvision/variableclasses.py | 16 ++-- 19 files changed, 193 insertions(+), 101 deletions(-) create mode 100644 aerialvision/__pycache__/configs.cpython-310.pyc create mode 100644 aerialvision/__pycache__/guiclasses.cpython-310.pyc create mode 100644 aerialvision/__pycache__/lexyacc.cpython-310.pyc create mode 100644 aerialvision/__pycache__/lexyaccbookmark.cpython-310.pyc create mode 100644 aerialvision/__pycache__/lexyacctexteditor.cpython-310.pyc create mode 100644 aerialvision/__pycache__/organizedata.cpython-310.pyc create mode 100644 aerialvision/__pycache__/parsetab.cpython-310.pyc create mode 100644 aerialvision/__pycache__/startup.cpython-310.pyc create mode 100644 aerialvision/__pycache__/variableclasses.cpython-310.pyc create mode 100644 aerialvision/parser.out create mode 100644 aerialvision/parsetab.py diff --git a/aerialvision/__pycache__/configs.cpython-310.pyc b/aerialvision/__pycache__/configs.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..577be954f86a8aab5b6ff96286c4d64d3363af4c GIT binary patch literal 1444 zcmZuxZI2r@5VpOqNptC`FDKyoAxNA=kjS>Vbr74y4jIBJ-zF=<>;R%23rNLt&0ujC-BD`d2D8oyVMj{gN3zEjN zAsW9j^`49`3599{E8XVbP@VLR|Lv3SPZrJIcsicWxXvf0l}VLn#XQ$}RSq#~dogg1 zK=eDHBuS|tDHBvMILpI+;MaHEq=ASAq#K)CU)ze$Y{lcM9OdI7h4jrrI|Mlfv2|;* zqHD4woGl6Bt^KoL71oiUYqq97NdvaP$WHQOiBuj>; zD%Xb}K1d!_!`VcZwN7fO>xZ9yloZ)7hsnwD$;11f9Da;^&WfxCBKcHRXL49`=&$~O z&kqye`~9o^N`ZIfC)@NVu4mnw{$^s%^D?hFe~dKwpqlh8y4Zi6+}kQPEYD30;s0=2s*?>%UywyM}~UDMLiQS!knJ7CK-!79391-mrkE2W08NJ)`}u z=O8stGtFI}>NccaflznBR(qgKB;+WY74>WV19EPW{QzMaFMAuh_t~|bzY6YmLmYdV 
zO(f^0$@!!bvjXiF=jXGma61Z~5u)BewT}uf?3O|X6=FUC(J`o)#4Pq(q5XR8U)<7j z4?=5rE^yp6U9mMhAmB6f?dZM!;_f!{ohNS6pBVz2-GOuyXLEOlU4PfKX8cK(mp@@| z$V*IP+M?=hn+%9ykMnL*;j^e3sPOqzH*i0aa$X-O9b^5N3ugY%>E448xl@+~L6_!y|~iuuL8g&_FWZ-~ZfO zTX#G5eBN|_>fCeBJ^S)M|Ly$GVpmr_#^3zjC%jAlDIWW;dTIVk@$v+}%6L2$^I~3n zE_T*E<7eaco;aJZXYy>yp6Rpca(XS}C2qvyikY12n9rWg&gahN>^(J?pYJ@|Y5w?J z`fPzX>A9}??z7z%mYGY>_nhqszxST)wOsqo_Sv)lY`;AR&JNgf@a&*Hht3Y!bNKAA zJx9)t*mLykD9?_$E%Re%$KtUkV_w$Fy%O_s*Ai#9@|O2HdFw21y_P&X&U?Y@;=PNn z+j#5tdU)&MZ98whULS9LyzStv-y7gCwl$g|hm>Fwg#=k50P@a*^Q@b2U};N3;9?G1A0 z7ybF83swKB&lHQ@O3Kl+n@C!KuL*i9El16qNZ+z(3H4?Q@$yZ~OOkiAfMdB0kC2A?$di3YcPznrPiToL#&bOsnu*P>vHl&uLC5AJH~gWk+=1_ z#$YR@Z*7jD_qB0voaeS1)TFu{U+h}^NbLRj)tr6ZK`zBwPG6V5U?DrbZ301r?;`i@ zJa-em$J=pTBYX$`JHj$@UQz8MhQ=srOTLzGYK!#%F}Bq5Oz5a3owd&9dRkd|Z`Vg- z=i@frv)&!oldFYVVSP#DW8=NrRqJZYV@s{eyOXbXk+!?m?P}4~S6$v-n?-ke6V{fy zsP$c8|LnE?p(XYFdulyx`S;d(!m@j7z4mRQp8LL9FRi%S+jl*&+Gp+U@$L@m(qHRe zSC?MZrPeolk9UvNe^hUeY2V7>Og$=w%k=jU8 z?HY4!gr0vJwR=0OX0$fya-ap_inQ@joJYKOP_HeuE%g`=yERLUx7J1g$$~m>Bj%&m z)33!}jo0Hfa@s|_cYZYX(fIj<_gKBHW3@3Cw?0D@?{Vw>C+e}b)>_spW%brFlm65mr-s}ic$7dGGLGG!UN_BDCFU{Z19V+?7!xu_^_4dfKCBHN` zSDq_Ay-+PLRA#DIi_e0Etn{6#mZmP2y<$D?%C=477WbaIG(WLYI5kzOg34I>pmTa| z@my){@YSigau8d|oi3d_JyV@4KfaQFwlrNX2I7eu6qVJXcmu9@`g&^aX=Vr?1gUp5U%=CpS2^D-Z?vIl+$jq0$nM?Dxegrh& z)@_i8n~5M*neoa&rnE41VbQYWzX(dBI#3m5e(SwBhO696_(l15 z;N0$e^vL3+b93c;4_&HO7Z>im|1rOt@F0Jzbb0ZTPajmkgC0;&x>7Aw%R$G~;{1GR z!6O^1^gd-XaHLc%6|0NIb7kx2YT0|_cBc4v@!oqMn<7HXpHQL!eqR!1^=fPlr0511 z?9IfViL0dElY@4;)#Xx-?&2s8$Zp zzUsqof6IZn(zyo?96fyW;rrh9z*{FPGxL|`3{9R~@)u_*b8?Ak`QYT`2M$axUAi~? 
zzUcE7z8@%+{h89-<(Ue|^?~V2GgEV=N~K)czjQUoJ~~&LKj)Pm`zLC(FvomsWltUN z+&?)vvoKSgoZSED+~QPeuJRZjmw--I-}{fKEtwaK&zJm46sEzwrG8|x6nWTy26@*< z^NU`Yp2#fvGqm$|=8Q|)9JBjsWy)Win~TOXZlwhAtKrBphnQct)85qs-j#KOs8-k@ zX7OkcZ?L*=#!pgcZt6mL>f({BN3Pfy_=i6N`)q63+V zHfyaew=_X+cb~qxR4y)_FFxs)mM%Q9t|rlR@*g9%Ceb|`O(G!7(uHDGSv60j{r(fA z+O(cOtB+6O+_GU%eO3wwJ+FA`(m74PM_3#h2RBL&+UMQT$4c?uiA3Y`L@G$1_ZR0U zgG6=Fe@3Ysj3BT1nd-WBi3U**SF|A2!>`8I;x}SiD*hRQAo21g&kJi3B&S&>>Sc!S z2#=tv;_7&64tV3Y4ZyDx!v>%mAO(pyRi0Z{foO>R7m4hOj0XdLd4BP7IV|;snFVk0 zrA@oxs!V#?FP9fqvNnb%$~&p#0m5&oZEvjLu1dLjX-S)aV9lmQ{)ec@iL}2> z{?_5Kc@<7@{~o@tbU(8sdbq;Q19)9vKn<+voBuq{tqD2z;_yi@(=W<-4hPU`G46pP zWD#0P^%z0w+#>r!>a>5UOo56RWXe~T0G+poPcAkli7U5wrd++f_srs@;)T-Xa&cj? zT4aBqLYl4?R9d)N42QHbaU@82bVJZV$RSAiAiuO&saWq;f}~%r1lcHV(5aZNloy*{ z&XC7d?@EwootNEr_D8Lg@Flr$Y2KfV*q4Qx~+k(RIu#LTtygkW6I}euNifW)>=NCW7SAlShKWmCG~bmnJnItBXO# zZDB$7$kV3|9y)&XNYE2z5N&1b+>LE5$ewub@zYN~dz^Yty?F4-Ohxl5=yYl1e~~Po zIeq$>6G85jK}z@~LH7Bh&z*ky@WJCjk_k!DGaDs66J#8#BuHOX9fJ&b48S``0p;!s zIu8p%p0h2NECk?e=%UiyOZ-%j26>@B@=rbU+|y^DIeD65<1Ys3=bnD@DLRt@Jgv>X z&yANh2feE|qs8wQXwa=O&|b#SX%kOAJO5IUJ-JvdGkGoso#9*o8>$3_OG`|ca6(M% zalj;J^TNKTmrAr=3FnjC3%82>8mu=%Q51l^!dZY6vQ_NNdhjq7hPWyNJDW zjsABbL+9Sp7W5QbeUN=InmCyk4TB4^XQGh)GnVw&qTd*|9=F*wMk{m1C8OFJ3vE|H zPWyxF{QO*FRC><6r~abC{ePimXXj2i02_49(IwGK_qovL;BL@;Zpy%G^xdtnaIO~4 zS!v;0Pt!OGDGdf^78c5WBh>ZCSR)`D?nX0*s`E=vD^XZnIR3*M2HziS-8kHO#(~%i zG}ZR)ARD16Uqevkk8m={OgtaYwEY*7g?K*E9nYr5RT<&Pr7tn{zP)274$jQ-MiGW>)kRDKs{C0~O% zG#jrbW|LlAWZMg|g;X2@C>}c#<6CMiE$(y;HXm;tYgsQzTw@*ao^O7yb`nzHm)R0q z>$>4sPTi}CTEa`go_oyFcPn)*VXUY7remwg4O4ZlreNx&FSBRWl50J+l!(FQch{1} z#7bDn%{iT^#nyW3WLxIsMBINnt%Goxs?NY2Rv-97IBbi01nI-ajz9Bc&~fy|)6XA# z?$kt|e?w9JnVfG@gv`ljo;z{yxc{9B(YE4$SkCv_HxJYvg6rc7xv7xU92X>MLjsTbfFX^@wz$Oz&-yFalXW_qCGyr z6v)IA8umnQd^DA1$P*t-rA8U}JiaW`hzT&*m;eqCX+vt60z#BD4QvYhTRe1TY%MOb zhCx=>Cf8D4qMDw~7)L5LoA8nslm4WJukOq8mfNDV9b1?N8sN%1d20`c_q$kzfZDi2r~+(lRQ+#IpoTQWUHv~*fDM~DNA>zioL**K 
zr5BHu>R|FImiK6UI8jKi47HDv?cgoLlpoGXZAZbU+q+Fy*iPyg<`!Lbbx^d1%oVTl98o>hF#mYeQ zXtRB5_nuPO(VzTDeyYot#&BF+8cVWK8%Rc*;`EYrx~ZS#CDp5&O=Ep8+4kPjORALy&3fr9Z>rfWz`|pWc|d3{b3F~K&xIl*^KxR$h3^vE zMDGa`1MT(-tMP?|*R=|;VaJi!=C>6akkiuya$;*8dP95@zmc^3kgO5+8H?+__2bWu zAXVg-SFDBZO}=)7dE_jQ<)&DJ_;Z`)(fl>aBX4oG`170Q(fl>aqto*Esir)J z@po>TNAuSxkEn-6YH^p&faIw11eA^zAg!?gAG4ymo2ehD3FL z)iOp=IRhbRj^4+r>g_J}X0U^6s>4=(ZnP4!Hd-Hx`*&H{(RdNQX&d_~Kl;x2%4LXLUT@4AU!z=X%*f?qO}Pw{%j=C=`=7_p-+y`BwlbFegS-VDvzIE>ne%Y#DiBMs74Hvn5VB=| zPHgHdT(Tv@Q09pY;r}g4lb>0L!bNm@LGsMeQ$gb7GqzoZ<$w`(h;8vmG4E>wT`35G za5(%U#UP$v=`iy1k>W~+ktcY%44_AGRx+Z~iJJ|v`uis0K^hXdd?n}zgI1E{&o=Ti zRpeO z9X+w8JwLgY15eM_l3JNJOq4=sd@T#M4K{9w8<=_xoPQ*C&0*JDDYb!0rB6ECi4KbS zfcV(0ukZ)%G^c9T)IUtg1xC5xb^;)u|v z7huoB#&Zak1|r<`8O}@RPQts1SOP{wLgt4F!5Zc(?E`t9^xr`Cl|ejNh7f!>-knH+ z4UENiX{T$#iW^MGDQ#}ipD^P35YY3nTD+FHnDT8)@nW^ahe4Vc43i7lw}g)}%_-tB zG%5dRHLg$_hMlYENQi)0Yr>}6h9b6>sAeE>K)D#04jYP8eJHju6xj#?!Jn&72E|UU z<%PzHBvLt5Y7myJ3GhE(>e=Q}Czt?qgByDv4J3z4PjtDl?^1w`u1$dY5c}vQg?O`J zNH$yQ)59=gMB{B-$qh-J@SOu14A3 z5*d5nXxtmwv>-TIdKrF(;Cu!T{R^6{{uW8hTPi|!uCs+s+V|Ns-!g{$gvicXhu5L; zsAku4%E4$Uk=FUyw2&e%AEHJ1wN9@Sc_p)DEnP1qjw)4x{}l2{ZNy5`Wir%6J?DfF zca*FOUJvrhBxJFrHxSXq$aP2aWuo__FSzetlCwq5WjT{_mgUfoG5;kwA}su?a=u=U z@L<0xrzYoJI1@dNfz;t4eDW8sIuhxd_4!-nd`!-_$@!OZ1O)t_mD89P-=WtJ$!S{Y z{vRmd59Rz9Ifkg3WAu~e%_lY9+wtU~nl@&6WVE`sH9+hJ{Qj2V)-sn2$cFp>!Q(L$ zUV=_%(14(0>R9mGBxLJ0e4?bBFIo4oXo1B$NyWljO)Rr`!xRzim8i()Qpo30JQAf~ z7zND_GiWGKY1$YNN%&RbX4X1vT4dpph_D93u4ZRDrGEx44WVp&n9N#1u`GU<(;hT@ zOoVg~zg_?eEt*bTORb`5c*=>h^UW~bcK3(G!=DG25dU5RTgiNi$drrcXUcP41raHo zTP|Kzw9?h0?euP%6zDyFx12q4w&K7=v0>I0*4UyQ0i9(7Ui1-q8ZzP!GPX@q=^K^e zx*VHNno2>z3Oi9+fPwG3o#KOpN8kgA1(JlcW$Wk3F36sK>gbct9ennw_Q|wFvY*yu zx)VNo7-65+Py4#|@zj4`8EH(a7Fo`^=_CkZ)5*i5P8Tmlup>m6 zLk8MvyqaN4)?~?EbK7-+X{XISD}sDB$CTJrOROfBh3;oXj6<*++78T!!vX`=7^0Xu z86s2Jzse7gWi=#Fuba*yIwFZRmZCvtmZIJT(b2U51_Q}iaw23+ zcSbug3ZE>txRs3iL39723my%gTwT>mh@fKY*l)8`D<~nn>}5`QICitO9fuA 
zE>&}_(2jK9DXr7%4eM2?rKMUbJwmj*q%+xNvAU^Ss5z*xwH|MXXRqc8>HA1E%zHnY zgvr$mI*Dp;cc=-3_8#Bk=o0pOW2TKWNZi4CzEPj_5t{I}nCx9t__C8`!GX~@-; z9Qs2ovEj85(?A>L>nI?#bF~9s*V-0JMsv}vi{)RlnY{g^gC#o|JO6Dsx3q}-ci^n_ z9lTtcLDONbTnvStH&adwD>To)jU%b$oaf)K_d6MR7;^W>zYk~Sz566FdPEXQZ>F*| z2lT4Xr3$JVrK*!uhL~mL9tY$E3(wD>OBzAMQY6PD7}`vG;>tVr9lqrI&}Va3(fhy& zOMnntS4au)DY^^j>Pa!c(CU?pDTGw~JIL;q_BgjaKa3CItl*Tj!nNnEysq>#tsH5p z_)jU)LvqgHK=F*rUzD>0XQlH!m(V?NGLdw6I&gpJ$rf)V)b%)&X;J~AdvsfDx`s0 z7zQBo6EYY>|(E8H4agA~~QV^?1bwk`ak)skk>3L`P)*p(2Bo6A2k0VMF89LaR`m2>K zn|6E)uoR#azx3b+OQn+znX;cuPrAXZc$x^83{W|+a@Kp{kUyX!ZOmfNkM-a zX{4>qg=E7JQKBv+7l!1ZNu#h%gRax@YOfagGH{ZPG* z#5Hp3-L^?~)u3ttZIU#Bz0MkUN@1v#MiX73Laa_4J$>84Z_Kc|F2m~5 z20+2+KF3fxj9!dzW7=$vplx-FWm++dn%_pp*`l)PuI=A{S%AHfNdGsvr6(d&Rx zd`P|z%dzEZfIOODVue~TOgHRjmz*iYD(`Bm;C zv77!v7Wz#JI~?DY$V*nsav8V(UC4AB*xbt2O&3Sy{uV||n%q11CHYm<4eVK9b6^Yc z*QC`1IS}Fz?0=5^?RA8BLW;OX>xDZuF~Ak_q*e@(6)7+Wonvr&Gv^0%c*J&99Ug&Y zN<3i)iq|jd&xUxQX^&j#Iw~x>c%odXNUbGpul^nCu`g5I##U&A9y^Kjmi)bP?ouS9 z0r%ny63@K*W@4r1^vry@Qao9HsrcL?ijw{!QK$H|{3UqPX3@~Cf~Q#D5%7;H@5X3p zPuN2u)23?H9W-Pe8esiqKvhj??LieI1wwMwUOv&`=GDC_!DgrJ1&ysipglrQGf=2( z!9k{K{Ssv6!sj{HPgcXnl#@-WA$;4Yrs1FxqB|kHxCg!QW>I=n?UuPByuUtIzK)mI zn=8x&@e$I|Tmi=f%M52inwbD`XEQ-Ztg_6JXvT2Bs6G?An`c6&bUTD^oSs?oA5ohO z4XO{Y{|<#cit`4B(|@NTt9#IhH0@O7YTac$r2)N{o^k`ayRn=?Mb`D>=}Xk)KY_zi zH>v7IBWQyc%KHuy7c~_%i^7BKxzHC@L)lqf4pNuB8Z{B1Zq&jN>swc^lM{o|1X)1u zYjQHkPojP0U7C}>Fgdw=sWj)l_yW(ILF1!d_rY>$rO^2_tRbx)#%X3UDYsbVoJtj0qyOOhz zDU#@+#maN&#ccp_;<<8kd2G6_BOS@;BEKvgl!8CXz7U8_OQ)6LgKHHzm7$<<{z(hhh^))lfD(} zx$mrLly>Q0(5hPAPu!hhUHUk1)Lxe!)rF%`yS-gTMDC$3d&0Whp}O$xPKC|hWO*J9zcx@mY&AaQPse@1LB@Ghq9`D|8PEJt6iLej$sSott zoU?81`%gOd>?PKu;cV=JMfpX)IdYCA9O86C^7YEuDM!%Hzgy0|awIZzJAhGyZ_#U~ z98r|M2uLT+xkJ83<-8MT9as@$u$}%fMR{D#6LJiK9j^Q8o5I%sh#lp(FQlU*(a!6f z6L6HPhc)j3&1uw-a|aVa6ZRr++Eu!*X7hb4`vgZHFHJ zkbFNZ=Nsf0;w?F;Q|kDTeBUO==-=;<&q!aRguhd-B7)sM_TBQ;DOmrX=v7;;{|PzY zE9aAPzE94lGx>f{&X36XQ90TYCx#tGoS83GBwiJA9~5-ZAesXv z+R-@z_Fw;3lUe52x`44h_S59LH 
zY&L$Dum|b=$_k!-#6ies(fD-#PFgF`AOfQeG&-egDPsH0YIAnCX>$(7S4KBm2PfH) zpN>=UcgcCT9M##i!y03bHAXx4{c4Pm%xtWXaCwd_ z)X@1jwg$@R(jZ96l-b8`BsrySA}KG;w_NzGvz)VU`Es_L_X;0RaAv)W^XlDayUK;| zzk98x+@+K5HkFRi|8aTCTUO`|aO9!T@be|NQuY@oJy_vE-n@3QZXyX_X5)g+j+~px z&)<7^{0pCZcQwcG1h-H3@B_Binbvy_^7bUbT2>5G;tdY_emnMwudzNIH2>ku_ zyj_jK!A}yY~RQA7?#4|DGO{3EF`uQc*A}nggN^uoy)Vkaj!=F{ac{$T^ zCUGVbR*LyoI#0g$`0+>XyK*0YL6^wMj7CDQ@sg)ql8N!(r z&iXMvU9fh`bH{i&HM3H_rKyZP2a6vC8eEPti2{S}t1XeHw97?*@@vT)=2y9ZBl!w4 zTNqfVQ6SBg4FlX{EsdM1Nt~9h!30962D=BO7NOf@wg!`G5*E)|=M5NSpt7h`t#wtq zXM4OjDv}IuwHKC7zkMChRxY(XdIrN5n4k~@JmMrNnJd=JAam6{b5k74aNevRc8kA2 zO4?pkUp1O}rS1J{%e(C<9Q81Q7e&e-H8o#aa-jQJ<=3hAkZ~R)Q3ui?53FTWg4{{1 zu{nA@NG;%<=ybf8D#;v)eM!E{a$b^iMUE=zUzM{W=j-LPL9Nfy0slx)YYb1OGlMj~ z^*=Ag`ozje>q4(%3dXx?Az0F6M22oWpT)yS#H#Vx1WdkMIJO8{!w^)+7%h|8hap>S z%#EQLhTLi6Z_G1|n^W%0_|{_Uw*owblZ!{3#Gt7Y{)*B=K7)(h=pTJ*>x&@&G>0UQ zYl>HXN?p=VoB|!v_~qlNudHlo>xxFI7PhU>5dCak3&B%ANJ~L7EXf)#qd}o(jrl1J zf>W_7QYQ4OA?~gtq*lm80=0OCoXr@n%BluXFvfSyo2n-WNAUli)Z zK>I%V0o7H2%_`=9P_-M;f$WlhsywZ;v1QMSrEWoY9fVuO4N3()4PfpoFA4bj(SQE0^W%jc8-yv6xBO|ypWs)~?h)W8MSagFLrlO5!a% z$7KqF=QfNtyq)KE8ED|S!+X?wC(j}V79Quh(|f`@$a9y>An@Gn9r2Fx+~b|W+`&_n ze1}XD5OOCW@Ai%pa+iz%5VBWB19(n&@A01FxsQYYr+MD(z1Mr5=RMvFEReItrEY;h zn!uV8w({;_K#;_1N0G}w;37>nsH5El?kz5&5?qWr;dX$IB|o7U2t{ynGulz|=60G} zFt^KGi7HYi!Z>Jl4B!~@>@$gU_PX}0{)O!pN7@3X+Tvg%C5%(p>g3q$q6kP-BNIMJ zz_G>RjFAuQ7$f~<&4F-Fc7OV)*^p^rjT1I*akqNI#uVzAMQ2WOZrtt2x07OJSqxdS zXyoQ%sjQ;Kl1yCvu~96gksW!}0u;JNa7 z4lrN9^vBYrs#&|Jtc+k61nt*iv?mo$l&%~&F|)9OSq%Gdh@+D5?pHdGdWh&9nh>PU zE-ubbByROl-YqeTZ+#sOHaI%4Vpb8lmh_fpK)$8w;D0*~k+os^i};b3oKVn#U_eNn z;dk0=tqv9YhOGlx5;;{ZF9jVh&CDGypRf8KB!eK&8QSpBwIk#6C^Vpwy&z&rY=|Ir zWo~9Z=)a=#!^)uY!pxjmH$mAPl=k2i&c}8vQ6Bxl0p1o$4t*krPU)0W6EozxGVU?8 zb++>Ag(r2I+e!Vuk5B$j>ATMRV-k)nSzp~5Z4Np=74*+h{S%9o>U*X4ZW%aE$p5Bf zomFQtP6>?X5e!yK7t2SmdvO_J30goa5O^AQ%iQ9Z)3{-F|K8m!~`N!j9 ziDBc(i1#A3hRuG3q}0Ta@g{U4R=WB+!ep}|)Df%`5-qs1c5k4F6M9G<22(eb>B3S* zi1Ehs!#L=aG~t6Zz?^_Ock(VpNAQ5NL#q|CGuU8V7@2 
za^boTT2mQ_K8TTJSBdn+8fA2%wlBrdI4pbenkoNvk_ zD9km#WzV0wbWWyJf)1Ggz@!Tvy`zS8NTnT?V}0>Mv|xz7Fqx%OO_HEoXX2f}Tz+X^ zxG?Eptc<^2kF_k%H05Y{s=Ug_C<>iw&}B#7f?9xHaNvQ(B7*jYkd2Hf(Pm6@D5DcF z;hC3=7&FAqt>qme0?*Sia4_}Uvw68GD*B07qJ%eF0Q#p<55VmSM##z9hC*4p@ z$GPRR)+)q{kD6B0WY6+TlsXn6aJJ<8LalLQ5(?i~xnEl23uW{fL59$yoGT*%keVdZ zs8}jib%^brB4##^DPHGF;Vah$_;-tL(30C?sB5 z={a@!;B%*o?=0>*{oH$x?#i#^EN=y@BzW!d8jHjNdm4z$i|qktT!76?r@m|fSshoC z4}HYXXt9bZ5_H$0jjbjBm-NL}^~d$OtNz&*m8mA);)8F?U0?-UYtlwpatZ;aVP%`N z*|&Li5uK_sOuI$w0|bdhaHsOZe92(FA`^%FfB@m`j8w3a7t{XJ5bT+Y$Q#Mm zy(emEy?ZHGgsJn1VV#j^^6U7MXa&cor^m3Aapj$NcxVaZ!asYP`n!`UjP9y$46%Pt zh#M$L!knh+HGD)q!Xt=ZF)?P4oZ{R<(qDWj=wP6DGWvK3Z9mx-m^lxmA5)7M@dM+o zEH0eT$PJR;IAb*kduiO5ti=SDOcEl=42+tJCblE&c<~s*aVLTB@mC#vY9=i@M#ky{#?%Sp5C)E7aDgP1FR{75KA1 z!*g&=G`Lhi0k`3HHMl08i~2lvLp&sLF!Zxnk1BO~Hc|p5)uebAgm_ZL{R8~sQjAQB z3*mZhoSxezyi$jZ-~#Aum^cPa{mNU~6l5DI+!%^W(FohYTkHnQY2g)e!!k3u-Aiu8 zdMq2I*Uc?%X>QJT$6F#)+(An%Z)dc`uwhBHjAybk z@ZN=s3yUu;6t&HneZZahi97uND9BUaV*DEdP5oEbk?OudC<9I(R+j%+@p8bOMIREr z;&g$vi?RF7EO_sRsxHkPU!217l_NuuGr2$q=GIi~$K`xdj@aP-_saPRoPYy^b0x6_ zI@uZ~L#a6S)wG>`M1_P_x?}<5vr15C23Jli;){ycCDSYSQ?@g>Xd9h{JrIStC=a&2 zKGgd9milY%=#{Cm@%&*V2{QY?r{a@XmkM%MMP!>*62pScF)aAg#F-TA^+=IHUm(kW z0aH8~ps@DMEIVYk?V*W;c1fsJVzF03w+<(WYx}6MX6>}1Uyty6xaF5kj9W?!`+UBQ zwv8A7Q`$(QKSA$01iz0OvgzqKIJ5FHd}km8@Ky?R##4S5;mFSLr~NxnqqmK9)}2|- z!o9=@2iqj));NPUj#7IDB-mtUq(fVEON{)UAQlub9Bz!{!|jIG$TM@J!*Trhnsr|} zua>HZK^5DX*EHswP#C0)xS`ikZI-@DZKj%P!v8>yvkF5^1o$ZWGy)K}r(r zpv&3XJ#E@6pkc=_ZCY*|fxn$OU}kuoN>8V)9k-rdVGJC)*4@fsaymW6JsZ;KMV5c- zMuMHPjApT9S^np35twk;t+L8Q&dtx?*O#>M-W&>%w#5y@2PW6MeF<(uTZm5J+jPRD zF@e8GtTj#Gd#G?K(Vfze- zn`Wpa$IRcB8OkiN8QNGOi|giOYn{$-Iw!v!?#!F5Q)kmGU01(}&XfLM(VzbRC+Dx_ z7;cIu=KqaewNJF*b_oI)Y_M9+c=h7wocQ}wk{ADj#4^(022}uuIkEq~AimHDayInx zkLnbY&aOT7pUKNCu=d7QTDc*b4s}4>%nONYlOtiil)J+riMauRwc;J0% z%W7YC{jJOH#HBg6!8rXn+ZJz=Ea=f9)Yp~;*W0|{^tE}>`G2qSPSbt040fhM3aF)Ru+r*X+p^Lm1<{Zn z=2v-zq8$DRmI(u5jWO0-t|mlad5JN00w8&Yca*%1OzME)k<~kEC0FHS3J1|Mx({Mi 
z3=CAoWYh=fZ6f6=cY4%1gvS%RvkH5~%K`H{2Ekxg3&=O~pNOw^RXeL$DKwA5`~X){ z&brT|z@G6cOd6D!VYYM`&MRMc-Pa=@W%Sm#W(J1H*3}-1+gI)K`nh|dcNCIPEik4A z#-74(N*c5Z};RP!M2-jCDjK+_00zc$$nD@qbJ%dH%cZWg>@9m88Lcxh~Mzaof;oES8JKASlzH$yK^fysL&1> zd>mLEsST}hS4?fBmKlsFr(r@mYa@=1`dEB<_f?XHMfHnIDp z0q7Y65Dfq=zFClZ&O3DJjecV7uqO2vVT=1IG zbgJzmFO>YHBQnD|U53GY2$9@H3~?_6si(cU@=B*r?&4FWm6GpaLm1`2IY<0_yIL;d z=XU&&U|5+vU8!6up8|~6^SD*yd$6_s@yJ#5n`Wk-_UbXPWR4an_XW($S^>09kSil; zbKbN;e-l#QP8Ba#iny>VZ~Z8?j&7zlQuN}06xO1##*aXDg~P)%$=e?VOeCf!-uJDa z`K!Hu@%ZPz`yc7=XC6oADv70h|G&}|98fo9x?S}8#9p_X3GH*lo3yU{kIH#f&Y$AQ zLVk79DG@p(GYzWiB>e}aMsloIdPBq zq_AlHTXC(H;$YYy(`CdGB(*WUMi&h;PXt7r>HnnSr#!#((!^K;5#DG|4h9fb%U+sq zil)b~`pP(qt9DpiHJc-Pd(N0`8v08pLUOPb``}j>&z}$S$E`)ujSD&tNv&-8lICg< zp9)4iH>W0FVg^nk>|dO_Wa_M0C7qlvUGe|GY6>gweoULwT~3P`Q_E5Sd|rBFm^inW zXBR8(ysHt<<644;7?`F7{^#WNH^Lr1KzKe4sR(Ni3<8`S906i73tKh|PT}_Hb=}ruS)fX^&oFbP*d=E50e?0Z&3|ec0 zkgQlJRaykC8{i{#@X~^h2*DKgPqkx}57Gb_xCm`-2x9=>508`~$eA^IQk*+>*S7)x zJ8Bt&t=U?~T86zW>)`paIABTa%_g8#-^valVGQqvAvju9SaJt&y<>U*!q?kZmp2#= zdH;Ade=W8OKJps-2qg@H*~s=1N7~+(SnXv0%xV7+BxmnV*7CxOUW+Y1Ma^^68Z8}H zYm8Yto^kpmQv6ht!Dx0KCA}VCsj& z4a*=cmh%TP#%4u|qmu;=pP-$g5xZ2|fCZg`+TqfWZ%`xI`tPEsXmR zn215Aubgi6pEhBRSo$(d2T|u?WRTuWK`Ouqgjh?5C^nWOE%!&4-y&9ofha(W;?%*^ zTb8%fxX?wviOk7OVL=;VA1%lv3f1jBrnUc#^q9kD?`v86Z9M0VR=)!;2IGWjPUQW6 z%EK|TZFx2L1utvx#rqC&SDvmo23wH7Sm$v&v!RY598gux5}CH#NsLiC9~X4BS6{f> zYZ`Eg#kD-PvkdmJED})kFmc&o+f*Ui8r32q6(klswFw|-?1@V%nm8*(10-4rA3K3? 
zXHzv!c;8Hh78s6Xx$5aP}Nt4 zTSlYaX)VJ}LNg|BRrYu>bA+lmb60aQl-!zhmxlDyAZ8bvx&&6(zjKU+S`x+-^AEB& zu`JyNzF;1+CG|C!8jhhnfkko+cVqNfvS(>W6MN`owbDc#28%TrLT-PMTk^}7-JNO( zKQLeRv_$b~8}0T1{$tX8k9tSvHv|P!AX$QXLcfafm8~s()G9S;8C#7}9oX$9SB))Q zlmZ7rlWSFqEFD!sS+EI#zi>hCg_p^Zd<-iKQwhI^-?o6Iq-$AQs)?Gc@q;}%i~?*5 z0k__j-?cdOF6ozdE)TTj(Yu21eIAC z!HVq>Wodh@xO<$I(nN4j!53DJDPb0V2N`g?+Puut&L70=Q^pR3pja_=0UH$^m(h>q zNR|JaP2*Fx>Wj4xNt16JPGCNJlMJ>*d8`u zS%6CY?aH>6NoHh(t7%32%9d#+reJ-V{T(mSGz*6vx|hZ< ztZ@->kY&2^Lm~Fg6PEFoO(#doITr0Lb>!j~cwsnVYg&_$ol3J?NvK`T!qbooPz#-A z;XV+7XLy644?-a{m{4b&qYHV2ZiOpPAsbOzrx-`*toX`NsNLkjJBS>n>dgucZ!(?L zh$FSPD1IYOXi#fCT#y+bf4iI=a*A>U7qCd`R=om@qGO*}BVIHbfB*(YZ~d`rk?OhZQlUgigW_$PSz1}X6yj*MLwTCylwKEt$e8k< z40*d4-*5V^#Tv!sZkLd|i#hwIaB-m5$LU(@uJc193(`W4nuesOX{o*b;Jkr>2#kP$G=63A9bQ=w4PH!w4jaHpijYgBOsD*M7tE0Y{{xvUjg?=LX1t0AoCJ+@I z2)ZQ|-jI>tjtg}dTC8m4Fgsu~bU3l41eOdS2Z+SdV5`jNjwFT%x<2Aewo-3_jK)z&A{B_Fm91fEbqO*Nc zYOo6G?IQh?Po`k*ic%9bB62g4$if;c`Bs~Y==!J| z*W#QoXD?u9L1PELX&&rfZww7@h1D+Rb2q;pe!cwqNFDClXdiL{!ge1EG5KHk4Y~nD z6dL1k=ZjJ~86j_A1~zmjjPEOunriu`yuZfsUiZz>B;-Bse57r}Y1A$0e-{^Lh=QRe zaCYq()niLc5>#W1^dgh(hIJB`3{|VG*29}gRBi2j)C`(h38I?->@HV;yXDF=szF<> zz2uXr=i0|xXY{r#y+U2o)ys*}E@xX>A#2^WKC6Si_P{~T0^YMaAACJY%@hNPxng)7 zNQ*g*ij-Gz2f+(+7^uqD1~~(iUd4db+7M?shZ)~t*}s?_xr{{9*ETe|CJob(8gn2l zYjkalgAK!Cya`%)o%r<>v&Sn=&gXSX6#9gzt)&JB$swYB^hLCIjHW9mjT zrZSKSbJ~clEg>7(eA_;|gEPe;Uda~7*(2IEikvo!ls1c$*6dOfvU_b$i)1!Cd#7rQ zWcCdCbaH+*i@C5y3WZJZzON}PZmlq;A|OM$w4G~rBO@N$8uR*Ll5wup`8m+xw$*!^ z&bMNl(_rrBSez!{b3j6vr+Q|+1F=fXpv$!XmloZv-CR=Wtk%5D zx9J@p{=$1cz5R=iPk;U34?p?Yl~*1QvItI*mzCx?zLcC|)9Q5hWSgjKr2}3hH>?*M z*e^TwORA4yu!gt(HH&=WfZK_`Q5EQRY$!C!Ps`|tX^b{$k^YgA8!c_@X``h9Rk8zX zd$@`Bn&ft74a7U*C=e=i|`TYPDfTVN60VU>rV zC7q0@_@7V>q(!CI#t)^jtwhcmun)LW2TLLzI!dWf=IXC0OAqx9$!mHComjB+H#T9|F$ zTw9mYrcLp`hy1Kc{y&v(P0pc~B7=c?Ubd)hZC;@Y{`V5O?YzwfQP**1>p0aBdIG^F zxRStKc)m>=li^exI!K`G*nD7PiLglZmvcAoWLq=B9>0t=7~zNiQ5&4byWJ}}EEJdR 
ztR9PPLi}HNNVciJi(%ttDh=E6Lxz{-4Jj6FWM_P8$#=$ZqPE%gsbvr~@bmg)(h4IGWh9{W(iLH({gNK4x5qS$$#j2OIw~$~7r8jCdV*Q9{HN>4$ z{gJv!`K1>{yB<;#%sDEb4EcRRbrzm2Y&{R4ZR=%e(LN~gffQ+4dRc2&)iw#sIXuiq zTgg;6A!RwYqPeUr<{xOh11=c>r%Y$$;{XJu);PcsDn~*k=o`>lkUIDhCSF+Hv%FQZ zJ%duTvvIz|=DW>u;Q}Geh*2}Pa;~?}p;`0n%oR*sNi#^YcC?Yuv~h|qt-TQxFg(FA zHUn@ke!sd!$|GWJN*7%RiW6T!YSzluY);bx<)pYEez)df<-3`YI^6%N?rj3rTu;0P z&-&HmWU`tf3`?*E`;zdsLHJ&aYh#&-y_T?V^aC+7_&fMzX5lpRu7hkytW5osKI3@T z&nmxHIml*guC&E181noZ_;dE&1+?q716nV%gmFn2Xfq{Ac(R&W_}!MTSmb4;{DtMv zV=D4XE#Es<(+e+9AGIYj+hb$+%&KbKTq?(`{GY7$wuE-9_1ysKi*qicb*6lY**oSy z5B#e1+CXb;iDV2qe<$r6tbZG#RUJqIpvaeBTK}aUPpy|?4zXw@2e(*EY0BzH1$nrp zbK6PCvp;lxL5f#J3jSd&`C5GWH!Ws?n5cRZGsUd_Gv{ATEq`v|9h5@bt#`iCS|Yu( zt*v*4)H`dV^?25T`dhu85DcmrcV4h+M(l|^LbZY zq23ys-O6uV_O?W-;NJ!@!#A(XYTE_NoNIv0KZGALr@GzASiD}|<7Qp~a@+FS;Y|O- z0V%p%AgZK{v=d_0QW+VNYmHkkaez3SkR{i_dd@N>q*`v2GeW&EXHr?tonhP|3@ar- ztRPFzCtlSWNj|DJ?2?am?_MpC=N?zbTH$=6dPnun*}GKsx0t$jm`}##Ln>Q;;la5t zDi`h8^sxN)O<|HTZM+hf4^Uox1W+#Twi5PYzidP`wSMPnqp&|^*IKRK8}%oopV>D1 zXzZhjk0#G2&AwR{)#ViG-n|>d-A>H7#Z95KJdPQRBIK+rV6V@2r>TkUs>hNo>ZVx6 zTSYy16fouQlw3wJZGw-+Fb>}xVzV#ueVwQcxr9m{oy zt04x`lyE2Bu91(s?)??{459lI`HoXDcW0<6#J#LAL!&O~gBYCc_db2QqL`r+6X45Q zivNp>XGX0*Dc>qikay?aPLvleafk9H9m%)Ts2@;_?^7yS8~|}YBLD9x2^b32gELG3 zy4yXG1Q;QhS1Q#QRT0MexRiK>gr);nhs<9Qa9pB<=pOn zk|3qZWi989Jah7B&~xroxq5o>$+^XIr8y1tudBF2s@SL14>p&7gs>lHN>#p_@LmXQ z6am28vC_baQ=KN2vhS$Qo;tJ%Mc+{{OX=@mQ7L{WG3A#wpVQ}Y{=^3pUoPm*97y>H z6IIH$k)NsjqSNAhvgOj97zZ62=XjFj&fS!(P^mcSH%gsWc7D}0*23s#n$>Ch%DZ?G z{=F7A&dr-_}TEk=cy#w0zX z9$wkr(jV*AdkZm=YwnB{*BSA!Gr)@IjD28EhM~i3PaCF|fMKiSyrQjkqCV4M9Yxo% zBd0N9Y^v5`-CK+Yh{3%VbY&EygU&(!qiUY~kZyQF&6&0crX zS|_Qkhwp0X;m!Aq7BWGUR|mf&zsi53W*`RfS9SWhK_gw>>x3Hl~7 zAr6@khfIivWJ0{1OnCpd5(fm*5eXI#o0?oCdnokFcEdkqAiAhWcNqA zCYM7hv2JYSL*DY>%$bY~nO~@0!O%VN+Zk$0KZNy8SS%ot!(5(`sE!a9X~d&y`)HUK zTT~Pdy2`!6H4xw^Uc(>OXo$16b{{1%>d2h`bBb@w7x^3_M@WdsZO~VS$kkg75t_F8 zZm|np93nS{{V^<$IRp$;4P+GbYXnq{DIUfSTZjFNv^Q$#*A|PV>sDAI&9Myq+WI=^ 
zmrO1?mBNist3nP`>a?r}&Rl+^fk}a{8Jh65N^B_8Uip^g{Fo;GJic|CbA$&O>|ly8 zG_82h;JP_Yu?|s4y1m;rvCpWO#^O1~>k}*}KPKnfDBgscZH+v|NhTXZPfn6mJ3ZZ{(>6vD{_8KPGejCZM_khWYiv>DL)pR)CeY> zwIqb6#_8ZdClsxY;KPOE0Ij$_PY_E7*=-M~ic70KB+k*{$1x=yzQYs3hMI9}A6M=v z#A06Jt?2n0ywl5HAs$+F4h>0|~D( z`y0-Fhd89zRBFR<+Nl&)=eoUyeBEAij8bf`Nwn-W3EOLQ&wCR~ig;YC<#r_5J(@)y zc$N#cW^;BwvD*dmKyKYnjAh8zw1Y_*2!r4C>x^_qd_iRE`Ggt%&I+oF$zm~KI|0F6 zPKwzV#_adHfv<|G-9vf8W@?E!5XKx_7jvjB=5QEuq#jcc9^GN7Lbc>M8phmGk16P` znC+#Hg)z6Ti`icKxa|bUl*9IDCs2E~M`gdNUPo8@)daUFC)j-x+@#D&EZ#Bi*UapD z{}IBHkgCQq^LAKFS#tUcV0(~p8LAEw#-8dG*@=~p%=GOH{TSyHLxu4AUJ0|Ks1zaN zLw>hk5*F^48`Ns~_ig{$wQm1v>{WX-!R?^S!QzYuhYExS}& zSQ&YGVTxn8m2xpO^jduJUdb}oZC7csGO}Gkc~;cFg8Ds5MvPZbZI&+Bh<-nU#B5(LA%i8|Jvd*7cOKcWw25hve=C=#eAk zH`hr2ca>I+qOQ%2iZWW?DE~hy$sv+BuyPFF`Fbmco-5CDTku!wWI=c9WbuMn)Ev9h zCGwJ&p`e*SxeJ-e=lFJ-!^9ci^7MbNUhoE0xIh(*4>IU%gKRM1XI_868-20K0H0Ym zz#H|&Y`rhmkH#CW!6u{e#%eHEuR;50Y*KzS8vcJ)5C6WL|MGf$y`TdAfEYpc*v!IA zg=yW3hZ(rQ1eZ$FV|dvrSl3-??bfhg0IkppOf$-P&skYKwJdeXKJLtP9X3u9*LS z$~mHH*tY$nRPt-+oXQgfISec^KC4?>H~NjTxyuNbw)aB(o_O8F*59@X;ij6QOp6dM zNn!dO=2zK|1K)xLDkLjB6zokbLDv2Yz{O#Sp?)OqcA%>P0Ed`KO_531ReQ}?z4&j!X4WfH-$W9tyj zdMgjLwo=CfrA&z+*V>qTG`1j}euOy^X_=-J{;=gH+_zmT)B!*37~W!Y`?shuJkNRy z{r`)AsF~5UX>VpgoR6rP(iL^gfkBQ(tZdrUD5I#U>&8>d+{RPOY~%5yz5QW(8Plq4 zli2tga-EW;muS3UwA8;CYXwoaghb7K;JToP(!^uo?{6MQ$ZB)pj~HmjYI&7*`hP0t z3vvz-t!a1pL&CmQy`YZNDScgXEr#9Z{SXa#1T5E$c)cH9B+sUP;8rEt1nHpG(k3_l zBi0{;GXCB4h}s>-@ls}Y#@=lnAVh2LgK!@FIqitrZY^$ffVKUL8?^nignego+qIg< z-_RVWx1B8Nf12rH!Py-69R@&L#z~!9qXkWBG8%K*Zse6(E43H|R`!l5YKhckF4~Bz zUr`Qx2Rbl!ek~%i2*GNS;^~`N`4kx?tfv>v?|wR&Gb#B zBEF4+k(qzY8#555r+gN!kh+4;x^Y?F8LhmApiL{!neQ8H(skUF;Qy&N>#7p}lXNNE zQRT!rPotyq_&fRKHMd1)p?(okI?zf^{UQTn0)>Yb)r#M9^cLz^y$UBy@&@i?AHIIQ zp*TTK?gpA^5M_lKKXZ`Fs8!yO4$GKNX4KvN38FqnV1QtC(90P$o!5aJe568X?_!Da zRIDUId{J$dH*Anm_@`S0%LU1zP*17@irr0~P|dAh(3Z|nhrYF~H;e)dOX>|v5=E-C z+-B5X5|*X)sX^b41X zZ_+R0uhTC@^-I_*+uBOn{-#U9((1-@jS<+{RML*Nl8SG-Bvef{EoqlZqNnTx`0|sq 
zuEDdCqKSA`yWzL&3RN=p)biXbwWrR>zr(u|&dgnK@^yEWi@i6zJ4JCPjDNGwV%-ft zN>`&d#WJXV&+BkP0K85i!tv<^QE#S=3E~(7_vpJ6B0176HSDV^*)v6PJ{~FF#GbZ* zw=ocODA-vI`Ah1fYU!v@7?LYY-#9Y2JM01h^ep;K zW?anciqE1hJ$C`r z#IkAJ@MD`2)Q9OKa3uj37?djxQ$M7VY`Z#!FHXgssNp~ht`pT{ zy?y5ojkwvdg_jh~&Km24Leq_^->2+MR?+3U-B{NqtH|pz2!;e3X1<*>FnPsiDB9T^ zv)T1PlmEe>OX>@nZuV_M*~Z_h>Nc!DMooT3SjY#cLr7VpR^U_)q<%2Wn<%e*T!w9= z3Lv|sQW5CpDW6jzkRLi6%v{gWklt>*Z{*7-q+U>$khEGnXrELqB#f2;wRUK{dcz@x zhMpTL&-)vej-cQ9aI<`bv_jmGp5Vi30pEnU3;9M&%jra2GpJH{)4}VtE~F+Y(9~xz z0@fA|v6$l8g0$EKh%uwDpiK2tvrVbOa7)D*-q%U>+9s*`*ckXt_D5xjs8Lyj2_4t5 zuoTw}ATAD%RE2dCu@lDAbp@Rm`Al(ZOd}rcC7dH;{%_Uu4(m(aI^?AzUlL?*Ars;F=?FZypagOUDwK9?`kG4aH44q)` zlaxZ!mjBvP?$F`IFm|qfR37^i_1Jcp4((47U5!I<5Ahb&zqq#ZhBN*t^KmMH%PQ@N z**;>`YQDDY)7u# zCyrZVpRDQAKTsxbkz=YPKdBPUwnBIv^=*ARp-_W?c4&LhvK2&g3SUoi_1_h4_R}G@ z_RxYmW))T>r8*YeVXQDn=cF7Jc1V3;K=5w}%dn$Vo+rpj1*L5zz?TB(Nyn<$9Rwum zUwpT5NH!QU9FCGv7Y++y6Af;hX-`Eg$1Gs3jd0s=A8sMshNw!NaC;u|4HT<|aH|U8 z2NoGy%E&8Xq+3LZ8A60cVQQDPE&<@WG@eK@;2k6gOpW^0bAo>8aEIuezUq2AT}&s! zN?#GLHGRdbk9`$ErLVel&Y{6d+Z}!t5u~p@?yJgpE3#_ZA;PgRYZ~;X$3&~pW&*l! 
zJd?|FUyD~Uv)mvX-~XqpLhgHIz-U!z`^<-FM~Uswo9aSSxKkt}{+`ud%uE+%2Z3Cl z#>`M3d3K={+y_L)9HXAK9EU%8YFugyOH=;d1(~6CX?~^F6Mc8zTz^@OhhzrY)xA!7 zN>^B|6zM5}VV3J_Aw5O9*79gm;{vya`7%P^3`>vMk(mnjMuDG7EwSkqM`T9fYh3?Q z>k=yd2J_$=hz;}TeiM1X0@^T-o;Q&Pn)@5((c3~+e1slp%wOqJYyNKW2EDOxE{)at zrN3P#E4F#tAuD!3Rt(ez!q{eJnskaA>1Qj}Zsx8GS-U(fmXQ^Ez6x2f(s8DU@*o)g zy96&JUb7%+TN|mc((z&#K``6i?3-cg65+VZq$KGxNsd#t{!4)&!{h&7`Hb)o&EXWV zyACcb&0T$Cs?{wBey1)Ug({IswS#ho>K#(ybu|839gUx8LE{@2m;SFu;6L1oz;A6w z;2XNf->kZCr2k|1_ofCfzE^vQaCdi`z08X^pUztOU(i=WCbi+AHdNgIdxj^- z**0>98BzD5U+O|%Y=pijt<&@s@#g%0qG2_~NMj@#7Jo%yFDb*+6{W&OCwNm`qz1nJ z<5c_!ZB?^$ORvt|vKjTGjnr$`qdHt%xH+agWg3F*NEU9Oc#EdJ zw9akA&07!JHD*c+R<7Aj-5gdf4KV}@>{?B1b#Q7giHTY1CW(E{xj}?!#@r@mXw_1C zjMCtucqyA6lx2#1q>FrAjkg8SupqTvdo0d*4cVBf+85}J#D z8fBMS76xL1T=8lDG&8-!upQ!huff2SCX}u(K!G#q&s95XUBb!TS4{Ulk2+&(U4Fl% zE}|AxE6#&)Vx^0n=a*b+ufq)YWwybQMO`){}oIW7uwWvbu8(ka_n8OmXX!i9+faz0w?SWZ!kKNhcKqLRUm z-_wZ4PTN!Kuya=LLDe=gA%am0@^Ghv4l%Cw&NT>FBdzwaPuqNH_-g&(MFBlP|6T+X z1G*+aJ87*?Zz0D1fG%Fgq((P%PdBV$lx2yL= z*B}8Tp#h3u^Lp1=h{WIbX9P^6+c8B042N&qVPlLM6`(U1y5~!_-cV9Npd*dKb;hHb z2Kpp6tR=9Th##A{*MD05`7Sx{mh)M4{twCb%l0bg({jEjr&ajm|E0oYp4LC3)*II< zC*R*GOh)N^*+TQ5mt)6q{#Ic!4q=uL{7>q`59{?)^63z>|D$qrG{ygE`F=u%`e&FZq9?S8ZI@*^M>Qv?PZ4ssjW@fn{U#{FB+C?CxxT z_F(qz?7i8q$=;LAXS=f<*+O<}b__pn`OfT??7Or2+sc;=VK3nC%I+oYo%wV&m(LP5 znmv(yr0|rlGN$OwmOsjjJ;Be8ACzNfv4MUTzh6@QSMoSd2~RVtH{L3hQUQd>`fSF>oQIX?p}J#m z@Uy)Bs+OJrm?NomENNw^xv%TjyPrgLplxkzrw(p`G;%hdWk1T-&nSB_3bU~kTD`u) zrg&}nBh1Z#AIG!t9PrYw#LhAT+z1dmoAK=Sv9lf0z~wzFn@2oz5V}1)^H?(Kx z9?t?K%n;8m5qms!Ty+c29&gOs%CpxS_qOruLtk|V&wj7y?c_P&?ecc>9Q5w-BteOv z9rEt-_7XDeO?dlwj(B%7JMRth+_iohgF2^6=lm^dV!Ek;WB4_sYG&ojh=8TM0PPD8 za}B)e1}8GnDA&dHm{q}LLgF$7ap7Cy!iU7|Ksd`)ujc291^had4sX;;R|{)hTrA%W z$B!<$fgu*cQLMd&PUB?1U3{+->c^`CvxB_F5f$a}59#zYJ2Jwg$>D`v7O$IlU$-tE z7edwJb-&(M@)&mex%oytQ$`6>3{!63^@JTK86j*7zcB_WkDxCFcVqw)Rv8Tz@rI!) 
zO=wH#I5awV1-?PPyF5&&+V+iUVfmY2k#Ae->9%73c9gGB^tw7?)=IB`-Nsj@0#`1n zSGQC1?G|H)i-9>T_#xoC7-G}6wIX@%$9|OFj2gs^jKz9yt2hrz3wymS>NV$4*Pb{JjqMHb zxw`hmacFFBDAwL~>fRn}uiG0no)4*Ei(^eKV#ckh#kI8-hc6B{wYWCc;tpEe(Xz!I zVT(JPS{%7J61Lcl5I0(5Bi{D#E!Sa|E%}zaFa$SN8`a&kxjPPF&*R<>Z>P5_9O2z( z#=YI%xO-;o%2Xb-fM#632zb(Pq(+vc9Ol+zSrO<=)rzhe|55U zvcLIQ1-C~FbK^@m)yO+0JIK!6&JQ}V36A{J4|-35-6PEA%)E^V%5L+UwD&qaYIpTF zDA=rEi-N5RHYt#@J~nPf72l?yCW)?W#v47J-19Se?cDjJg@svNu0K-1S;)+V19|@W zQabe~6i9Sh$xbbMGx;gUiP}n^O$_au_W!K_1gS@kbL8Kn0un_B=|@jIRaqZ4BxZzK zI60et9Aj*-wDOgqd!CyqfU{})=yK68d+DmZ*G+1$zn+ap0W7qQscPMNtMpK7X||X* z7Dq|R?&d>oC7TKe2QP@A9FwQDBj6S6-iy3Y| z%Ht7`n#q7GH`UQ!?YI!9p4=^e&JXol{`*wl0R`79(Dl8sea=J6N0jg)LC|^P-lGpZ zbnoN-hxODab3K|S_vX4rrV-G=zaM zk^+-k+a0-&^N821+N^3l>5WQ`@Aa#K{*;1G2(XXN^M>}y00x8gf~+em7p$tokQS_~ zgSdAtJz@idZ94}TjYJ*J7S=?RA3hB<_f|=Ejp#Qpx|CvR+iMU88-OSkPa88Fz)hXMq;NAqdvaF;d|*mPeNH9^qhO zuUs3>m2*rj#F-gLFFfK0BlQ=J#?%#Vkzr+n0oO?zC;!_g1B>)-|# z;#sb?5lga?aoCAqwF9;s1Dyy~fSqiHowT9koObvTO%s>A-MH%|-L0T^nr4<#6+9lQ^#%`r%qh(H&L<%PLo zV$M6ZG_KP=wp*Rf^+=mUWU#;xf0>CNSxrUUCqLb!VaJR(`j5K9_c&?@H8>21H|LMx z9HEbPo$tf;J(W&vM_UkO{3E+?B}D}E-d0KH59GmHw<^9@uV219{zSr4*<8y`(J=;UQ;D-7$6|CNCRjE!U+ zK0#*-8F%rU;#aD$S~Ql`_K=ka4bP}PCNH<7L&o4d2_sSnqU4tZon&;rT^=HPM4ugq zFY%LXX(CTV+bIPTiPWGgS)C9oN>M%NxQlQJQy6n=1%pqBHeF2}ArzEg2$e0qPEXME zq+g#7OVsqh^=m!-a^`;aH!BBoJ|`5*bR!zHz*Sly%l3WESLh)ost>5J-J# z#M@yEZ@~=fxE)K&p`;@L>uwv#wlhOb5H2lIQ(1~T8@%xkQTz190!aWhoQY zVRZv5!*#%ql0FLYdFS(8h!u2CX;l7^NEv;;r-DGJx{b80@>8VM^<>*?sdMna^#^=! 
zdHb0$U~`CldaK*KNfRKX&TX-By?T1uN|d)(r3PemU+li3QHFp&sJu*3_awGbOCL%| z+p9aRJ^G;j)u4`g6DS^FJg&OS)Qr|kAX43pz)7PsR2^ro_muZu>_cyUAGdjdFw(qEn zqT8AM7}-n7q*?V=`?dzEw|TeO2;L51eV}@WcRNrAG3Gbu9l+;z+w`5~ySxedBca>R zOW01nbj{Pg){~o~w6Bpk9`Gg%_A$!ciAU?U=?6U=pG`mD=E}QE_SviNv9WJEb9?n6a!n<} zJLKJsaP%%cYpiwCZ2Dnpy#35ELdEXHmis;Y9X5OJyo)zwZCw&n^$|iw4843QD~8YB z5xb!{#m#i>m|72ys%>urGf9o!hI!5q%uN-A83 zL$(fwUQ3^=8t3`mV}6o0E|^5%HO>o08yEgavFE_lMj#fR;WSZrpVCAi7CucZ$Sy7W znD8oWRPqx9SB+}ktU_B9h^j6e;+Rs{mm!c4q;Q=cje-3OD#p)|%8+i5U%GF8zmkqA zIHBMPf~%_Os%RmOI5#Raui!XAka@p9v(!ws?+}1!_p73_p|NWTLDt!OBFzXe5tc#M z>B9W!1%8-Gwf|1y4A_5MS?xXsyP>9>=uT&`AaiVaVc`dv@q9i{EuYkjh`(@EM6UmF zJ^l*8)kVP$o?OPlSaBK`60qfGpOd!s76u{cI4;ZZLHc3z2*v=Pcb=&K?oi^;1Y}Ku z+2FA8^P$ADl>uH!EdO5Qwn;3v#Sdp8sTC);Yr@LY^n*8APicRHf?>VUaVCH4xEn2# z48Ew;R}>gAYxK8Kv_{9)RX>b`H4?AC>}L1G;?hHm!GiymO8=&U&nggk=v=R6j>-C3 zr^sln2GPU#o8|;sijs_Q+)JPQFRIj=3jS7uE~4KLhy@)wB*Xj=kWq`^V`y_#BPtmB zXZHLBynLuisV(D$dbQ$Gv0wBtd|sM;7)x;0Y%s7?KF`H4mH-yxmN+2GVcn9m58WS` zbvCIB?xTBk%6>dLR-+8=&8RQO7BNy+J|9J~>EH!>%~Fhvkg%93AyYGOCKC>8ykT5nQR4Km`o_)k*N z*odQDa^=M(N0oMInp||%;xc3j3&TAcnd0K6LLc7zt`8#+^h0>?EsK1Gr?ME{CaH?FIFt@)A zM3ldc3QmZ6`YugMB8?s@G^qY0NNarFC?%mV7fKbGw5Y9CKhe(v8C|JgCv|?SO;_#F zY0YM#ar5mrr25jm>8+HO!|@anaQU85U04llPkq0;cYS-NLkdVY0G}gQPRDU$PFzENk;{z|nx#SFq6Jl>v}T7>>wy_)Uq@<&s{+W<+t2JG_pPBmspFv|^Y=0HpM?4k9g~1%xG3^%_SGNT$YQ3b~ddbZA^G$}%hl8K-WKvmK`fU0CRm$tCdmtx~9xYCr!& z%c3;Q&Hb}&rTe3&kv2>*KV|jxsW$iqv}-QN;5K+UojvX3-cfCp-(_c0YUOO2!MMLn z>&N<~zVd_SM#Vqle7tzw;hAF}lvBJfOuZE^;|iJ2Uwh5ga_1QY)1Z!9m- z7brCKDSPfEU^_7CpAP4~KZ`%YRtXA#+L7F%`ez^=k~0Qt62tf(P??JgZ0FYE^IuY+ZOay9 zWPsys>Ir&ErweoCqh^8L|AauS(O*`~HV`vrdRR?bU|h8ilKWjP*uA8qt)9x;H>Gb= z_Cxh$cf%3O@a=JZhrjjczo0tEZjkwAWiYn5!_B|xbJLe#pWEw17;@M#vS-6`gsDYn zEG;C18}AE6cnJ|@ zFL(%N^zkoP8@=T7s)qdL=RTFL%?AE8>^Yo`YJY9?x!sn3#T>7T*r$Srz{?T zLhMy+>8!XKZm0BJbjKGeuZ%P`YG;y(AFWMWf!3xC)rxK2L7Ru)t66o^DIjluOi95OQf; z!;a9jZN%(L7?Q9hJm@w6n*&?8*Iur+wbV!ox81y-$q5D)7H2#i*fsqU75e8%u5T<* 
zu1B=kgbw_3%6dXU!%lN6Y4>TVj{?S0x24^kLST}@@eA7H-58lpakx{cXO?~Q3Usoh z#A*=U6Cp9huXKwJGvRJuM$Ijpc{GR^|Ged3-dV=(yo?nT2b7h&f?s!={cC6;kpUxH{j4PcYCJ*GhO>|!AJ%PB{)B;GV zWL=&!{r`@jdt&*OoR-Oi06W8&9b8=cWG$UorOQpH(sEIBtj9m7<i;P^p7^KN8RGYyZUSd#(JU;6&Jk9N znINUpNDajiA#iHUNSbW`_Ql)z?kIJu zJG*Tfoya7iHPWKFpN*Qs?z^#>cI%USl{TDNJmWBlxfE`yQhboxh&kr1gc_}M5A+Bj7;&7|IL3g+CQu|0+O`_@fE%l}@AMfD&P=$W3sb?t_;fFNWPMFHbp{(YeAkRXj=2s z)1VVo_vX>cbmXsTjXy)7%FftwOMZ_lefP8Kg%r1%deIs7!glh@nJTJ~EE;6&OpU^{ zLa+T(pnS-y)G@4I*#+@sz4vnjSP@;5?1b|gpO=quCc{wNT!TZ45KsIRGOhSOf)LkqfefxTQP<2&RX zYW07p;0+ZPqZws*ZL&dbalxB|6FM8@U^yFu(EoGm@Pc&|Pa@WFXKdpy)Wp{nSl2&K zJMQx3XNdir`aemfx>F+BTQ;cO-SJj2<*(zn8Pu5|36rCGikfTx6c@sr;#YbvL9~%@ z1Ys*-doXD%R_HjQg%V>veA0F8TGr`M_^zefb`qwe{2iuaAft9D#@eIJx>+iyPkoP@ z7!9*AV*LrlbZ@{n%K~}D%-X<>iv5g&j}SyA)HLXBsO_eZjim*)7Cr~$XEc^Voif3$ zKW^FFxZ?76D=)F-_E1(o>S`;!&5T7O#paN@oc+Jpa+|k1+hB3sVH@JLHrNbXb<<{< z*q*Poow7ZzP&(R-qwO=Xy@s1{$K@6qwY4oxY!oVc|QghrYgcJa!pa0A!98cxxx+%_HpSJsBP;ZdZP96}|N|F`%4uh;he@fM4} zc6R*nw}0w2Bvjf(utM2jM4iOV2nCS2o06| zo?8EH1wsV=?My-8@J)8xcc<7FPPd4+gDJ z#Hdij#%ryB1eOqn5JP8*U+M1H-skWGWG&8yLW}|4sHU&9m9cLdUgpL?IbUh*88|s2 z;^4^@UX{GdN{YeQF3l5zBu|zgYbEX3O3FwGY#uB5Q&v*!8uO%MCCN*av}Y?Re+HNd zv62-l!_gC_P>+tjup-q%@|_OJxw-nYmF_2(E6i52l04WHhJip7^S1TP2G;taR|yFduxz&6q~Cd zkvcsUp-?&4=nGK{{!hJrLq!9%S|Vhr%Ybn>qaoCpKYRo97ecoVA)0Z=&MAS_&r9MCyv_VNA|Z|_nh5EY5QXsv zrd6#m2?ir$+dp=HQBp8B6z|sGe~tR<94+aR#!o;kNqMGWAihWPI`0PV{g^z@*4AGh(8Vz?VA_GbF!+K3oD-NZ+ zLHU28OD6h`P>Q_w2~|W?#rj5+(X+|$_(|A5M4MOx;cGWsC( z`BY7>B%X?MrWjg&mb#Q>EJR7zQmdSaZE;l5Ip;p~ZjG4MgZ6QyeNMyam8F)9Z zQ%p0ynFmaoi(I#gXF`+NGQl-+5Vr@i^ll7jsC&a~_Oj}09WBL%`Y?^TRr?h{+M(!- zsuSxt%d0@u#d&BAyWPs~AWRm$gEQN&cWBvXJRw@2|3qD!Qg^IR^^Sd&jGQ{ABa;k5 zG<2$AOOr{59tL!5-O6{?O|&mX9;a|3@w_k5urC3nFR`F6Ih`*dj$LB%#bxxxMDRuZ z`Z_iG!rQ(+c3TR*2GCb$V=qvDi%n$w6&ScQ&Rgh8Jcbr7@LDhAuFqw1n=qTOhW|Tq zc?>IbVN@Z935MbBfn0BIr?0YOn}STafH3MC+HAI`Se%{ZvxOp;p&Vy*TuJHW(rmEl z*_rYZ%-My)$?=mjr_Rn6y;2mJE}xy5pEo5m(>e=Q$AISw^UMCMQ&W3^W)hj$&0F+K 
zz9zxfNy7iCf^F835~>z>f%W&QQ0CO)(gptwGW^dfP-DJH*0|tVq>YlhZ^x&@O4Zvu zs$f`wj0D*0wi?4AcktOoZ+T(%Ehz?r@sE>Js_g~+4ZOGS Pc;5pq9uNCo80`6PSh%l? literal 0 HcmV?d00001 diff --git a/aerialvision/__pycache__/lexyacc.cpython-310.pyc b/aerialvision/__pycache__/lexyacc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ec78ee46e51540f598ff8917b3469247c4df25e GIT binary patch literal 6623 zcmb_hTW}o5b)D{+-JN|801_nlT#=TR~aAA_{;+I=~VFNHGS9+8S;T7Yppp zta=)+&8)2y=@MOWRaEwut8CY9Rh%k)_%9!+{N_L9?{-yU$MJ)CC{CQn2@_ksw`Tw> z9@bZ|H9dXL?c06(cK7t@hL_EzH27OO_XhjxE1LFW6#D;SAiM#O{}>XcG1AZ$!Z(TD zbb&f_S??O4r9DbQZLAqzh)4NE(_Ap4d~zYFYDq1m)H}V9VVa$FG^im5X`bmL8Z%g| zr!QD6&JvHcg#l)=B)kV1nb(S`r{IZJZa+6&e=z3Pg)`c+t1EWR@yA+?&Cx5R@vXNs zZrqxgYNtxIR;{&K@#{^Qu^nEw8*6pH?zvThHAR0WywmVZz~g@(l6FRDf-KXnE(}I` zl+j1DL+)yBifH%~F{VEvMEfFicwP(0q_#NSq3yANcH=#*Lq#H>j|ekXC_i^sdv7G5 z%z&=NK)w{pU6aLoq@xEqYC48Uc2g_?oi%!TCl45g2g&-DafpJYkD?<~RO^0Y5 z2yQzc#5)P_BN-%E3R*J=ltF(w@h`>S8wua*WLu^}Yk?Gt6M zunR0atT&v(LvPh(C5{1lM$SGX7p3+Tg?8NG+~Zf zOQU6rC21^sbywzBeTUCGLe$;bEW)uCue(CVe8KB2sW-qT=C>MkA?aFGnp<8;tKMvR zT)f4-<~$U2Yce^tQFU4ZhOlVLQ`ph>$CkXNGuEuTWA<2eiP!z{moAS@dezmY;|hOFIKFu0wU@^l z_Ji@UsfnpqE`5IdRh(9<4I7=Qw79qIR7Is_b8)4zHa-T^ZZy(+9HdAd>z{>V4QFH1 zu2xH}O?fixY6U}8Va`M72&-)OZx)>NOOQ}{l*EaNk4}k6GsGbHq;!+yNQz|i95u)> zV!ls*V|>gDV6tnj4*`IW4v+se0~R)6v<9=A!AyJ%bC<kQe%p7Zq}@4=F0-d>?W_As3a;qpDw} zeaP58V#{SN0(aJ z3*kIOZ&a;+xJRDeBhT!S%X{Qod*shGF2~u$6&>Us1j#1^mLJi& z17K%CekjThu={}(4Ak__V2}<5RY_>a&p+`Tn7DRbpJGjC4K%}>2^ zXR16gwd)gxn-M?<5@^esp)C`51GNESLtfRWR)Gxw;-|Ou`|brvQstr{W3VwfT*h%bbv+qdvm2|9 zj8%ae2!4?FNl|f~bu9h?wuaF>LEFa;a6Bru&r=*o<>?v72-uduOYm$&i>6iRvyP+) zNmtP*w})>w8cxk_6ktQ=c2zjM@Z$6Si;B8%By~lD=itRJVsai5Y4|Jk)`2}ph>GGj ze+tEXbY_rPq&9;jk6**Ezw1fVK7;bw)1#mDFg$IN(3b{-$ZZ{`p zCV2wKIbLTQ7*)#d9Gx1-$b+*FjCn|ep4Q9EOA%*?$r9}3sk93?2bJEGnQ ztD1WFP!2}}jR=f;>*Kfg|pnN zI==6)sNmtSATY}ij!Ikhn#1AD(|icj({ZbtGG1Nvg$E~`>B*+k1eDc%@e~=J;=$FG z2QaNN^S5`D@<=p)9A1MeoHwBdE6iJWLYE& 
zUt+u}Y_(d8IcZ@K!geY^^(%ZIvM%#jMs+VLvImGtsFtD<185St7D309U${)?c7itD zAFpj)WEd&Kcx@|HGm+}#@2J!DW@c@g$90Xe{y;VJ^g zB_4nbBoJkYKFClql5yO?W;v;%@>hWlTqMa1#X*u7BNZpgk}Urfh%OsYMEJ`hr6T;P zDv|7%HLa6`C#Cv!5+ZIg5{?tl#$yZIo$0_)`&DX7g*(Tn!$cdyTtZTYWln#Ow~Nj79m1xD%|7>go~qgVxPRb@Kd z)I%N?1G+Ff@sS>Wxh%Ahw2Bso2! zXMlC8I~{706OcD)3T}H25#v+5;W-3I3N(A);oVOPv=-egFDy2ICtg#Q_|wfC5mXoWQW9600Slro^Z8F~Owj|#T+ zQmxIAUpymM`M0skKY^q;gx8K6{gVV{Cd{e=C}j)@t}j#33=wvXKStcZ+d65&wDi~O zc)eL`!*L>sDie8*x_*o^;MH(6P*dCjUD&C-s<_(ldlmn+wZ^dza{sNXf zr>YW_dd&s$%kjpKU&3S<6^1g<;Z{&4)vTR_rNO_2`tM+J1e2p!GKnS2Y8{!Fg*73< zm85bp7;@oIzpdXN9oV45&ARh82zbroc_c*LI2S@Nq=meqW@r>rau}i z&>s#yNB`bBWBgt2Yz`)J_``|6^~({s82!HalA*(5fh7Rbo>%}3Sf|pEqeo5VA>;9(C@xKIcwgB=D;Y`5)97;Hcx`>)6 z;q>5t4kG^x#AvN5^ILbg)rJk^-VbNo*RTwd8<}w2wL14)o<*pD3AW!i@9}~<93b;r g(_^a*=XG3({xL|*mrd|H>;vXZ=;bgqGnv9~0mL#y&j0`b literal 0 HcmV?d00001 diff --git a/aerialvision/__pycache__/lexyaccbookmark.cpython-310.pyc b/aerialvision/__pycache__/lexyaccbookmark.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..689c13c6e4ed8768b6fa779495902ada075e7b61 GIT binary patch literal 2712 zcma)8OLNpl5T4O%wJ+Nk#}6Kf3c_PUd6zhZgp@-tcpa#Mbz<`fYs{9c5w>itq>N-^ zZ|$a10V;6JKd^J-FU=vBoP5qL2a;|{YfM-}w56Wu?oZRxJ>5O)7mGOp&$F{v*vG?! 
z{E5u|M*`*wyx}AeMi_NTJsD}8#=O*&>oQGbN>i0GO4n8f6#q^>hMPfE%ztUziO z((cm-6SObG8~y=g87%xxJ0$tCLz%+Vr=%k>jb&hzna;8>DlEtHFsiK3(O8l7!8{X@ zZL&istnq^UN*+`+R6G@lcQ!I4@?70SKYxoM$QzrQneb6%)-r*7V<3axhM*465q z$kb+UU%yfN)}j!N?t^oD=i&WJ0n*0BXlr!g#NvWE-kKV(pDmpqzx1HoFc!~&@FjUk z1&z24*qHqNyt3jq?Ml<}DrUu6;Z8Vl;nT{rZ>=|NFA6J>9Yz-~eO__RrHM-QTJ_@j zFDJfegidqaH6zFO8UgoLZ7XU7CXYUAY)n*^gZ1%L?{nZ%zGB+kG2IO(gs&>Dy}4yt z){^h9HBG)&4z@%lYQ!$zMQ1R*6sa`|tA_S1yFBUomg$C<%PHu=^6J$pxbc$k7(%0h1O+*@WblQE^k z6F?%U{5Dyo3@T=rJb{`4PTH24jP=7X;gwoz_=fA+%cg5sD<(Is$mYhGQ{fqo)v=4D zqzEIIoXz@%(lK0cfEbPEr9knR@!q_OfrIIARI{()x~uO6XB zPUPAm&+O3RPJj&$7&9`jt@yCPANDi%MShu^!AhbM3fp3vLY==oHFsMS=WH|dy&0d+ zIIi6~KIIwTL1%NkWuuBYW~`W@;ak=^=QgXAa{LHx<|vY5xDU+?0^4K!I7&W5@)3%q z4NK_V9?CJ*+p?bWkHWl^6*9CGq2RqH6d%q@NPZ|XwzuJM$eS#;P3AhD4YC`v@v#dP zeD!?jOnA+e`h*twYql5i0xZb!PY~}%GJxa|5WKaVWzXj}A3<3O$tS2#9zkM>tm6gi z(KIBepthi!NnFk#Dve8WpU!|JyZ}Vc=viG!eo8zlQbE>Bx+Y1G6ts`#s768dO8;4U zrE5bl=a=EbQkGA^Amnfh{0ZExGC0pGDAJL`DDd;h_HUSQ@h`cy<(gq=hx{Z+<2Cl3 eT6})Vy*%0U*}7|A#;zS=AoOo^*hzr`mHq`{1Z}$j literal 0 HcmV?d00001 diff --git a/aerialvision/__pycache__/lexyacctexteditor.cpython-310.pyc b/aerialvision/__pycache__/lexyacctexteditor.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ff3a806a2f4d90a6eecc2ab3011b19b52126897 GIT binary patch literal 2825 zcmaJ@TXP#l7Vhq8Nu#SB2V9t|2&9N`SqctGz?FrBL@B6Bv@BkrFiuTP)Qvp$XlCec z8+&TzDcK_M8}`9^IAJ}iK7Bi<``o_Mikr!S1` zA;03}>5s$YIy4;u5k%0CY)pH)K_%H{hYqZ=38T|hJ}zyPX5;d>vQe3h-Hobn1b;|2 zYN8~{(APypxX?FbQ&e}DdS5mk5a6ckMeEd#;6^e?-@Fst@9DtEmBb(J_#>%%NqDew zsk?gI4=Hlaz83T)Xbv>}KOlk%HexxQUZ1n+^|>P|ITx;|J|uZb)I=TnvLT0L0!oZX zUV-KY!iv#Q4 z{_IYFaN};@wgz9`{iOfJdjDTv_6IlnCk#g?U|Waw8-^`IahksIt-q4qSlRG;-Fvg) z#gz|n_U>jkG5f!eU#O)ih5YZpFjhv5buF8L7>c z4=?vZ|Nd&Pf3tt(;s>i&Jspg9L*E2(K9y}`%25ZvU@FzhkmF( z>OPV9zy49^AMlO)Ex4k+c5iuVefii0)2RznJ8!&`PX8R45H#uXsY;xbN}cLj`#F8k z=NK>jDv*q96Hc}%^oS}tp;rkb9;7eC(Mg@nXxY-*pu=qmTqTt)he0Hx*p~MEa97&W 
zFy4)f`a2`4#*5?+$o>`HhE9ByrEj0A|4iUHozZoL&gWoY0A8Rk=(t1{$pcK>srC*g zvA_)EgLLt;Fq9)dY!A15 zAw__75(dVWv{6A~%YKr`NLU9P1fHUaYVJ4*rU_tR4HE%_4?FC5+;Il!3+o4w z@%P(BG}^u?(jYzi>}vm?w>wN>V2XU`Q1u#)A6F<niCB6tOjJpup)L0<)!F zM!6**kjPk&IxXvd7lbnM$I_Nfydxv6s>oDFCf9huD2kO-XK)Q(UZUo4qXH!qlIB3Y zg-dVa6325aFRq+}f_(?>UN^oOz|m8myotPpPtH6Cf=^*juL5xy{5a=Gp=X?O@ReJ0 z-5QKtnDJ`ucbC?{iwlhZ&g<$jh@A*O8i4iXCN%wDAOTowSqp`{{~P652%xsCoHdI6+w0! zb={P*3Ou54!UGDoa8bHSa-K0gXDWbCcLK?myBYk8=#&TQJm^@2GktBd3lGGmUcRs? zmOIb(Y7rR`hMlUyyjIIdUIAj6f(UBqD|iaAPTSH*#k&B)Fwn;G9eJQZsEa$!*iRHX z+w%Ky98N>(z`g1cF4D1TL;q|d&qGlE9pBs){#`(N5RHBWCVc@2hiG7&@aNJpWpn|k z3*`7&-72cWs;fkUtijpNuu^rY)Tr-X>NBMD@Dn=67PSVPPS6%;fVSu{2kPikjzte`uSIXYP75?ZQJ^`rhqjj%Xw&b_k`yW1MM>=Z zym>S8=Kalk6ON5#4g9u`zbQU_*)aY{oxvYR=S_Uk4^aq1u)1+0t=SEx`}_viwRywT zwROW5hUWcEb>QUi}5!5-66{D!fL{5yM&WpSlM?EeIVghwR z6vZU!2{9!Ope~AOF@t(iuvMdc@Jmo+tbEB(Dx*SG??-JD-x9v)DvB6{Z*p0TSz-u2 zYdm1W+_Ypd?Fn<%@Qj#ioMRft>f;;-4tg2~2VKSC)30o=NZRB}Vv(dRCAG??npnE9 zeCeI{f+%<|SPw7M>b^4TexPz1-i+@NsLWI`3b$m%zh3k25@fj+#SaZRf<^xI$^1sx z@aG$~VBVXrY{*))@bZiE=fcWX!w=$UKK7&d)o;8qU-xb;%%5L6|LW=27S60iwZ>N6 zi)&%9)|BCGzY?!CJsH2Wc4uL}+T1$X9S;YnXtsXJu+*c%qAS=DCEM^(Y0)1baK1K zIpb#OJG^ZH+Y*^gQ(i*t=o)i{t6NW4CxK+wvaNYBqW8U9HQIJO7U!`8cD>KshmbgVPGi*?$g-8KGF zGupW%3mV7TV@Wm{nHa^F|F4*vWJaz za^2Y&quDz@Nt~{n*7xwR8_v*Q+1Dd_21am}U@IOIS}S^V^flu|d*actWP&`2)|}Zk z`I{o?5DsxsN~^L`dZYB5(2jfO_@!I1R|{y{%U3@*8HXn|(nYV?B-pwP8>QzWY`k-H z08^$*Uh#seKZx^u3BRjh42pxT#w}l#!u5352)ov6fgjP{dc7A2ccqUNhg+2mUpxU@ zUpmsFbLRD#Q!apHH9pWM+IOb+vl7h1p2{R_OQSq{UP=IKL8=$?x_hv zJ+DzOlRjVcm9^<_M~dHp{@?L%!QO&~7PNGFEF%fnn0% z&^z#jt)N1eyUNp%*XYb^l?`y0Y%D&fnL(`;uK{N4_%Z}DD&ZEGmgzn8k7^ zU7llkz^2hc53MZYUlq8;UG&htPj!J6d68Mb&G9LI9Pnvck!LYNAHRQym)L1$eU;^h zSlY7gI~dP#hgqilHVCXxFxUH0oJSU?1DZKB1gj#HoG}o(7##y)3g1N-lW@2~p1Q(^ zOxd24-geoP$1p=>dwRN7Q6qhx?BkX^1k|77i%5Mz5Du1P zF4s}OGlUacw6?}O0X&0ZC+xuGHAH6irXjB+W`fwdgI#D=Z{Kt@TB5yLXL>m4{6Rb# z1N*F$eac$7w!EB1)V6Y^4?~T7A?dGfw9OH0-mLU0#(KCDHq=N 
zay^Xq>!mylgkR!|UO-_%opMmWEY!z>N>M*=PP3w!10;{n{f}c7p(2*8XAa4OV%@$x zevJ;vApx(5yTo;!O>o3kw+8!j$SR;hra<&c*$0hvIs6vl&{PUKb&7dnkz5}@B5kLK zo7nx>qS($PnZ$g`XuFA%xH__cR1jYv>Pm0*)@$8)PIr$i<~T`4JY;ENL<}l(#ZR49Cfvk6 zrB~U-I_~C5*l3>pGc85y7L1(}Lzf2JQuj7FwO9{v2}Nho=*=rzS}a;2QY9h%D38c1 zs8Y4i8sP+MWQW?lEh8s0;k%RtU$-qk0H%gNN})W0u`lpNWc=_U<`mSV$mYlsp)BGL z)!-U$NHw$>V~#e}`)H*8A~mimOaj73;0%*}QJbs@gE~O4RxW{Jpm!v(w4nmBGeW|F ziLt~Y(|O-$<`cN2ky8qLhO7}f4@=8{eXxfC_Bq}7 zmUlQjo<$yya2Dac1~I`r*af^3?17Y1^G(NKTE7)Lg|y#9NU-Oc%I-c+%8zKG=457R zqt*l~RpxCk*rbMucws+dOTf4Bt`L!6=ms2QtUJVDVy>gL5IRWnhFDBG*uTBf=aAd< z5CXRp8@Hi>pOT|8Gz?E6eHgA24TGca!{GRlz#E@%1pY|K6RT=M5H?tY=MZ`X!Mlbn zC^ShKMf-v`HoS0iTk9FZv><+I0h)&Qrmkroqt;C-$OyIX-p`G%Vc-aHBZmfJ0{pWq zi<@84$k2J|J%Y|mt||H{3c7kgH!deqD>^qEE-o-6&$cjy^)a3&NLk3i{=oR2u@1EX z)CCle6*Kxu1+Fu2RUUydS)?LxxTda0^d)H|Zr8o$+DK57EL<=gGS%da;aFd9l@ljb zCF0lr_Nww4p~y_#wpAd&=81&uLcZ8BLAH zd($+41t~3c+Cmfb3R=gBAadyxTEe{GspeQ-2a`p(cEkauB)lQ@A_eSwtbd>bRbm~c z-(ontmxfrCV+H_GA~M@luKyTk{4)+aZGLcVyzFoOWR2* z=#JP=>lwgaCv7FFC>6??1;neI9YorMS6E~Ulr-^wTh0;m3T%#CBh4yLDIP}Y6~93% zpG2X?dueHfgc5M8e~iQ{|_aY5|`ACO8UQa33SilVB}SSs*qeN%k?hKLTo2%Cu)2{ z{0$Cp?mHzqo)me#}v=m^cLd$(ruViF*ySXFD@w4NTtl2w#@w|J|@4i0m_Kv#N zo0lh@gheTfd}cjeKrHr3$D{rYGR;*i-Pk8IEO2ZAT9hfTW^`;!p1 zpV%o&(!Kph_Rza%+1;b=-p-TOV;DG`486saApD;BsXOvSy6a9`b}*VWR{2-}T4CE` zf_uX$S1YiAoiJDl>_{5z5M~>)1NDu`18{^8Oz?d;HEmO?X;lqhVZC+(V~jQ9>MxlM B^fUkf literal 0 HcmV?d00001 diff --git a/aerialvision/__pycache__/startup.cpython-310.pyc b/aerialvision/__pycache__/startup.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a6cd5f746c676004d9684f95090a71c6297459d GIT binary patch literal 24043 zcmbV!3y>VgdEU;6a z96+q+WQd@22ive{sp7mU%0ZGJa8gNBQN?kRN>$2LDZ9#*#EHu#j~z=DS1Lv-Nu@+v za=|i{@B4dZ_H|HX-_`WYbpPG`_ut+BtN+=?z(6*Jzvg2vs<%Ig#r}jZoqq|uyojs$ zNh=mpF=f?a=gi$YXUn^D&XIfkTwLyna|yX87gOibmVwQj%Sc}KTvqP6bJ>MhDUWNQ z8Y>lWAFO0A4XqC&9Z~k?m~vG7j&p8QB~YjYETW~zF7^c5!|QLs2an4i`t~dao?&Y)FkfP)Mhn>`*yWOZN+_u+NQSSUQ|0& 
z5%-;Hr+NtYht$Jr7w!+MN7QcIcd19!W4J$}rqv$YcdN(M6SzOB_NsljKc@Dp1GrDC zgX$3Od(>g|B<_!^r_>SLpHNS$XK>%EKBk_~zuHq}!m60n2&w4v{7A+Xap!BJZ z8czx2qXy23@jry_fI1P>JR#7}dsZ^`4%(egoYEYEAFoxLt{*@3=7|&cm@)16argRC z#ZR9%1KVEe!+PB`l8`A^Gy?#l~V~|6;YizqG%6 zQCFLXo;@J@T=q_SZ`DhxX6T%pN)L%%P`A)a6>q1vSqv>Bgl> z**(8h((d8&R}bw6(f0=B4UM~FdNUtZd`bluOEFILKzXBU^;>u&?2<{l(* z%eF?WH2%jdJm0WCN;+xVJSVMWl%KZrAi%q(^kc}#<8p8{znG52+}L{TPV81@EwhdZ zyJK0%xn!?pl+6vI61OpJp0)0%B+Kv=zk>+@Ov1CcVE~f>45t249JxuAy=~fHIW`c= zv3+hQRvgdif=I2WJ@mH{_u`l0+*)e*wv8U*>lrnI{DeG>;wedtyaZYr!&6pmLVqbQ z6=7m~DU^-hwpY?#+Vlaq9Jk|b$Bf4#o`lGFnGT4}>=CdDFQcXavE{b2lJ&9zk&i|) zfYJ$+ZoTc?N~|S%N((66wwkzLsqJdV`)O4~8#ynx0o2ab7~pL6Pz0Cv@||$4!|tFv zbZK~f#2r-+-?7!MoA%0pH?Tf-C$>^hkKD?61$UEIz}DJ58QY9JudqI@9tDjG?u6Ri zF1yK%m|YoM%c(~vVqU=;+#H*T-O640-*3n2+v72J@&oJogcVyEvSO`aZ|Kry_1IVp zZv-{sjr2g-(cFxBqf*BC&Z+j?jH-RX+>Ci+J+-I2QLx1^lw+mdimkMmQS^!(5bgOaLNJ-Vw$t0$)AxW^jP_b` zE!k6AK)X z_k^_eas)N)P4_@0(C18m?;dYYPicF!k9&_xS?_2cSFZ%MpYWdOsXgF59*s7=W_WR% zj}}n6Z8eS2&PMp{_4f9(L8v{eX&LRY2yUOZZv(g|q_tNgsQupl4WM3IwN~v_2P+~4 z_@p{6@IR(r$9_5B9q6giroch(pp^Ae;GlxiA+q9-cc`cKly?yP=p@R?kE}JTr@Y?H?K~sHv&mkK>vo{eIlaDQ{GcO?X=APydxc0IBQ0(cVl6xvuaMg8BjjHnpw@R<}akwTY}Zw;D66}&p_^v zsZWyoLHpQAy+Toid*@Uud#qZk6pz=Jmfd?NprBQE+fdRH*DgXqyX)Ygs@#iy@?xdB zaMAVc`2|t>T0<|1D*KkGvc-e@4o*Aw^RHHGwc^=Qy;(dtcW(@O50$-|LNjr<@-B45 z3yr#~H=*FJ<;R;0g> zcjI~-_3u&L(v$2Wvs_mdU8~kB)eoS4Ld~3F*%l`3N{h+r7;IlWxDYXcE<=O-)g zE|u!4wWCdfX7NHp7w4*rmEwug67YJfTDjIDb^(nHz}d#KE?0^(4P_wi(XyI$q(475 z-_XD)Ae^q$s+9|OEh)RV9dLIYy^Sa(OZD=_hW6u2C3XF-Et&cWlz_NOnJJN;pDm(P z1|_7kWkja0#0neap8q17VSDLVYYWI$k)^WlXsuB$)tb-a!N3J5`l)hbaS>F#pXMOg zY9Dqq1BhL72nn%Uv(~LUNN+l4WBM>Mr)}6uO3gJ+z_gfeyz4s)6<4rBNv_ORn`pkG z^wWTDqphF8>)*m97!Sqn59&%8_5<;5?sKsOE$zoNEv@_3{(BsjZ|(DwFYCs&WcY5FRgLwYtk#=`GckA<;GH_ zP9$g#1>+FZ{cIb-o?-kN`jlae#qQ_N$S{PQ%|3dDv#k%IiqH>}TBtQj%2Xq{Oyfrg zKzmCjS*Z1o`9@o%H6Uay3SFNq-c!i)lV>W6jjR2X*|5tIRjIL!W4f^zOq_Iv#s30( z>F)R&QO6I>Tx>M3*Jf%+o7n1QU8&TIXDZD`ZJEa1v&CtCV3YXkYen1L5Tl^6w 
zNk179h1)XRgLAVzbow@-I6-|)hRajGMv&u7US~o&Puu#_d_Em!v}gQJnDqvdz6See zRiBCKe=2$+gOIiJ-}xYGXRDB+p>?IX)Tk>^wD`u{Cj;*Fqx92Bi-9&R&`1wC{%_c4 zU#tHga{MGKZa|I?ng7=($9Y!kUeSNfEFs71drmH6Ay@0=+A>5$$t_aKxk3~0MhY{x zcmWHgS)8vx;#7)Dpd1%s(Y;tHHkaoYt8Q_A*>xNB*0z4yO8vjtLMyb1%T{`Z5QhZ) z8!fi^2T(V~2LJlhE%j436Qb72kRTj=KP#jh7STc_!HAYMdjX2mT6`_B;M@eyHUb6= zveY49f?YSA^anbm%iOF#)b)zx=nr@02n}VY{UlKqGIXzsJn(LEZDwO>EB*sNu|Vzy zB|sR?$61SFy|ryaP7<+WhcG*4Hr!&(5K8X=RG>_j+TN>gO%3X=;+pz2(xH5Lw)msQD6Lm<0QxGKI=Uxp{P z2@xO?;H{GG*#rAHVITeH6PwjVz+ zdu+~6%tBGV4#h-RfFGy+;Mp0+jHbX2tb$XG*HtFxte~{b(gTY89Gb@u=HZ7TCX)4OVc0o)8*QEVQzk0HySR`LOLf@DllI(Nb{bl((sro z%{Tpa5zGMihDY0=upMSq)dlLADAUh1VJSd!sFpF@6tFyoru}s7rQj(yFS@ZIWMMwY zHY*fh(2m%aKj1Ec_l8QM&S7GG3-cIOFsh+H8oswxs-~Z13q}$3bE5hLFNJv`V_u*^ z`I)AuG|*`L;hIrz+S-kuI0-wVDg3~XFX9>23PE$_d1@-MO#Q*~ygVEtn`r8?pJV5S zl(X0rZDN3HWdd^GF&h#D&jTGKA#V%EWX6b`O<_*QD3MKFoCBJhB?BC2x5(F_u11*? z(JE}0nUfnf&5}foo53H5Iw5C){u<2K+^oD0cXcCKV_}nFKa#sS%a1k2@2VwPsrj>SzI^idoMB!AMn&%CV0wuxM@*a^ z5p#tFnu%yMO+E?B{xY5}<7z$xE|!jGtr0tmV~P={fFq28m9=?gt+7aYcFC{3eJ9H!hBAXz|^G8)zMeHFO8JhFr&uUrMYeT^xzv03m?` z1Rmd{ae!dM^}iLa|IGEL;rPeFNNWg3GFg?x3Etx>wPM5V??i5Y=QEDD{c#W6{<$?< zdbH)pCJzh-R6029DR?~m8N^YLy^_GwkQY~3mGcsphE?7;=GRBm01iOlio4*bf*O23 zeFrXXIPBTdsA&oCW60U0hS)CNqzvWb=5S~shQpudj%EkFL zbxIFolJzu`PcV6j$;(V;nanWRiUj6;vZ1SRBQ(xyhKz(F z4?ZrA1yOr|R-$qhSi#|tE4SGla;ZINVG`ZS=rMFHPKA+JtJv)!=y&PBUZ%POQc(~H1Pa;)V?W1kXr4{)(97&7J=a*Ofq*rAm7TxO{0!tdrhXn8QTnafGjqphUOJ(_z~WzEaeTg7SNgMj zOqLsP0xkH7>sR4b^sRUF&k^irSe5`(>vsM_#DSIgR4nzuV-`Gd0xlOf>Uj<3vOm?7g?29tA7kIp&xcA zspTiJFreQ0$!kIsGm!SS_&vaUiUVOsX`ri_)3;wEG`L^Uec%fW5zya3HXVQPfmNWs za)D_Ef?yNLoJoi6UY|lyiM?`Fk64=TU41g28%c&vp5!u)W1KPuvLQyED@3sGMDLI; zJRG%o*bgphqg-SZ--P2ir>g&qyY@e$IheW`PBlG}O<^Z~Qj4=se;37Oha_J*HS^l( zGjzwwItwj^gjsbxV`A7ug=3=pOk3NzC~knB%rA31;h#_%FEX|d@ZbtHc>pU?VfzZa zdx$`>JdASPxf4@1q^kq@0(S!c-MDhvr4y)U;RxM~5^cp@D{5;J0TWm@XeZ$&FQwYG zICH@pLKaxYoGW{UQiyF4PDjGG#%3DJHC3!P+#>BTaoiNma?7;0u;CxiqXHc2w6jh& 
z7i1l_*Md-XI_{@x#;t8uhL{{UAi*;2A!r$m)kshZnGuUSCyCtzbH5Y2U(1AyY8~j4zA{V zNJx`F&E-)?pyndiS;K~6=$=a96c_AA&L|TqO%+sPdH{hiwJd5ALgJGUkA#S%pgf5d z5Sb*gAq?zGL&c1nC?!CRPyrPxj&>+ytpV{>6i>n}T7vT;A+LAi_Xl{$*bGk@^#oAY zMJ5yvzN6t1+RRrL1fDSQ*1+MG0Fgv@6Ifjb1mwkDLepgO?bEO0SYeQ-f6d~l%C+K| z22Qf>QQUDScKwl^jpEXO#N;rO|H_1O19v`MEbx}eOo_OB2Km3p{;7bpra}hMuXF)V zaSU!@YgE6qi)1=w_RzPPA=*ldPmva4)PuOJ4?pP&vHWB$xX;VoV3Q*ff)aP(7r1V? zx42?x7SU{)q7YOMv0Rh;eaZeaBu& zqJ-g#yZTB}S5kd>Y2?{%;(QWm>OA%%p0VpzGH&)#Zawb~+_9RkxY_jrpRfW4C4a~r zfN5r{I5=I_9mc-TAUZ6&KH}w6VnPg*D{pwo4fB7(OKq5c(MxZb@1ot&JKzW4tTgIw z!KlmEGk8nm?E{s;o2{}ddGJ|8lmW+0UY-Vym!=asSnoNbvlvP963FXYCEv>GclelS zE|zdcZ5{HHixpLc&(fK%E=-Si2pr0qHf#0s4ZTpRS6dZTa!XMCBG*x>R(bb&sa*Dl zg1pnOzG05N{4uDcJmu?g4?45CP+en1zU;fbkrJUy{v+ z%K^ewbPnJfyPttiR9vJlOlSQ8I!KQ-^sBnGbg?ON)z2<0SIf0h6DOwdiZ=A(taywa zXc1yjw6!=us+!R(3InK{^M`-y_E%7|ZzE{cwSiyf2K4%wR4 zMwhf_J};RBsz`9Zgw}!FDOK9M8Pb4Op&!4pT5k?72>EnxK#_7*J`Ig9=SWQk`Z8*6QbtxL4}rK=ub&E07H zZoD0E(}Apgs?moO*Vuz(J+4zD$JMqW@%wsnkfKrwMpl5eP4;Zx+GsR6F3K z^iuo+1?5{X>M$g}UC!`cQ#TBTkIL5bUgnN<%R(y*dfzIZN`AeP=C>9EWX`K8)t;W%XkO*wM@Xb zkf)~Aj#UR7W{+U-IB-R9d?#hhTzz2aCxw2TttW!H+KX_+!TMG+SAdueAod~5(affS zKmfMPRZqS*h;@Q6bMaXZMzh3qgjGWSeCIlHhoHP=SB6}K`FlfmEO*o!I?C|xYBjbp z?2e-Jz{=1<%tKiI(HKiN)5o%F1)=lmpakjvETG@R+aY=TBH!FG1L=)yhW{LLJ}YIv z#8#m6j(hl|LW0cS1N6Hd715!UAybOy^a?&XAfz|)9C7(Nu7BiCtWU0qiYhqgf%C-r zgn{e8`#0Uq?&zA0lXB}Om~=$vdqS;5->w*~Z;vSbmjO)g>L0C}eUa$f7vV<&?R_yC zQJiakRJGTB__;uP9Sxv->ncCiI4tYLtm^QCfNr=a;meN$*1`8x+yPvf;pYL}kdLS% zfG}$}{5+ryVWUs8-zL4u4c6{tw057Cez6TYoix=w72! 
zb+dL8_>`h^O;34K9cya6n?nwIE%hcAd<}Y>rivyWEWHy?2+u`atbQ1e z#{BIJejVxzZeuuX^}j(!x_5~667vWPu82H^%n}(%Jxl^x^rL)w3`zAqaDk=$1T*$B zkt3I_c%l;tdTXOre4|va)SfN!&nOdAKw2rBo)@79K3FCb57F@;iK!tP(fDNa6j|TH z5aY)B-W=E_8>k0K*1IS@L-XCRXN*O(fhJpaq2w||1rrsILR3UO)(j&NWTqWI#h@z; z!>GEVa(8Plqd-oymx4J2(bK4XKYy}Rhx6kBu}*-1ev5ik%wcha$}WzDa`0QAqOp#o zr*;w%WFqV{X*4%|3GMiCDj7clxuLgHpD^V97HXQOsP%0jx#5NvhsGwt(4kTNKPZB4 ztG!&>E9ErCQvGeNT6#VF1nkn&ullLkPtLvd(wVbA0$Lb+ma*noH>5X*4R=hMS$Gc& z#0be!h@_u16V%rO%>g3XvRpR=y#c1XF@|xuQ2%xlb=T7Tc?aZD-Npnn0=Fp_<^5ik8~O6&3N)tjJ~a$teCwr z$l7r_FO26HAI-)5VFV<}A~;^Z&@f}C@u}Zr@{6c3y~U9EEoS^3HZg{%diWvcKr(!u zuw1IaC8GT-zPgQ`NzNdmC=U+^8IbTFKYTOMNdvWjJDJxsBFE;riu@Fg$GE=mV5Z*B z&v#6ZpSprg1XD-9#4!}al0Jz5DTxs??uzVUNneId{|e&5GBC;|TDj>D5w)OKgVJz~ zBk9V@%`+=N!Mp@$3`t2RDmgY7pyOwxSj>n4=C^T@rJK}3#aBs6G0No>RoIoRVp+gN zlzt0W^S40kbOO;Oae7b??IDpSS$GH-VUkWwA$Q7}gii%dd*qsk3?2|p!O6H4R{?f5 zqd*GIkmP04lPI0E#>FqgmeNTc8m1judeEYY+RVGM_SDrw46{ERoS?h|l|Gpa>HNTJHYzMsBN5# z!PbK22|bA)`*xQ`Jxlb6^o-i@ECf0{J_a_H79F-db!i_Jf|ir8i%iS>WH`h-4G6K* z$ zEp?1ZycLrj-U@vj^P#t6W{j_&nuKxq2hIn&^F`BuezA<8Rd_0dcLe&yeirXzGKU0` zj2ym|ow>+3$tHmAJcG1kOm~U~#&AgNM#!~g>tSQEYa@vq9IQ7G z)V(N-;3PuA!I_53CL9f4Hw74IiHH!99m$NpM}GV^#*L#a*}0K`-$0F_yR(%8gTS{N zGIETOLKxoIwmU)?w6O7?LXin;_$;4pF!=%#Vb#L9d6aDq8M5NX0K;n!5)aR({Hx0|0QXh2Fd!#9jBBCNi~1tGK|={^fuH7V!v zhUwU3J;HwAyac<>_A*2 z!3M*rVNgof8%z0(qG1c|GH$-^GJJ=RQ#PKWIXE#%N@+5zxsG!Zmp#Vp^D7s34BOKkr7`pO- zBhCxOEjb+ZPIpQ_R9>J)BB+gUOn^m={tz5-Vx-0lvw@?64xf-N{Ivk~2}c#@qt6X$ zwmkJz`gmoMQzysOAhg^mXPck2i@iQ?A4$*(kW&?#ZS20!UoWIVE z{uK$_;*6mgf41P?L{1jmM$_u zM8sz~r1zNIWb!|eOy~W?1%B^f0u*keuP8{|dbR7p$QKY3fgG-v`5BKl`7Kyt-z%EXv!Z&bvFR@@Fyh$LA(%mg8 zkZ>c!vAv8&U>?7Zn~BTHw#g)?Qq#$vDb_#F2}qh{fe)*r@4DLC`ey<0A31RDSs5_T zxR~1|>>QL5LI$!M57D_MVy?Q*H4y}J2LdTzqClH8Ku}qZ1`_O=If!Uu$30Huvh~U} zW}#H?MTVSF!Zjdvg;*G3di3lUT6y0j*0^H3EwTE_$cz5?Oq_|YKFD@u10Lej874gq zegb)a7&eH5Oum<+*6=|1NZKC)B&0>8_P4{ok?VvC{tA_U24&-Sa{?)!6#c*njzhwj znN(DsM!6gYMn``s9tQ^cL)0=4VuF@UJPn+I=qHJTm;uP(LZJS$?{?c6ltN*@ZAZV+ 
z5|(8}J%>|qoi55e^a-lx3wTC9fyP3u#I%=-@S*e`Rv8>UI5>#Ju?w_}F}O~tqjZm( zkoZwA%3eT@N08ysNt;51oQNPNBgpU&r5kdy^lqa~{1jNwdw8VM4Y>u7Hk{5|Be?J& zC8R><%!R)m#6bE1%s@_U3y)F}HL^XRad=45ZXttGL*wvwCxSBB!#ny+6){j~uVBTC zAB@Vld{x$RiZ2v(i%Yf6JkdQS_|!yRYp}Sxj9M)<)y5{Y%h!4H%>?Fm7*2f;rJ5%l3YzJQRBi$7YY;~m2ZDn=dauMoR7 zFQx4GZrD$&U?fd3u-zWq-IqJYINZ@)holOpptIP7XZLjf7g6{5)BIHUL7fAv$Log$ z&^cDt4y{g(5MK0O=^yQVc1M*>A4MTnWn?KA`8QqYFe}@B!nSSM)+27Rk&SH28|-`l z&Rc#bXf%gp&G_9C^RHEEOZYOYYw`obP(F|}YZbBfSkJ~e2gBD-!%HViK>uxEjeHZy z?r&=L7@20J-w4MI;rmrSb2k`jtn*0}h}ZG-El{KR3h<>*in6qATAJ^Tm`9_M3I_dL zofmyl1^BDP5T(J%7W-rx=WPa;xSgRWY!J?Znffz8n1#_^oWF|Z;!YqkA>#Sob?DEq zrO2`MGD^YwB6~KnFU4)7oIoPcS!G;Zjz}zHw}q}Q@RvY(gw}A|Rh5D*mO?4qN0D8K zU$_h8BrP?_74U0(!UVpK;1qFMvHfnS0r{NChUFK+h4ynd+@Q2J6hVb{bT`y+U}c7F zg|=u16vi|v_VQTNR%D9;yA0&qI9++d@h3Vt{yq@6RlrItmL!U_Z8wS-*uzOXz+^)+ zvVX_6e-DGkD$#g~(dShGt z8X)}<{Fo4a0f%}uXI8rY3cgow`wc#~XGgf#KSW*{NH%u?hJ4F# zAu)u+uTJhn|GK#*EzKHT?1N@b6#4jRytKv{b{9H3$R5@mMdJoMi($72x5w8S{D=Tw zMGji<^)hT0Ut=ITumcXN=sPr;-~f}jLA=Y>LR2JBfOnP~id*T~;{9E77pu*pu3TB> z&&A-eRO6TL*YSfJ^7|RcU#=iz`gv3|l1bc`|AVjJXCf}m@8Sup{+rAYLzz)j{K7>G zpiFIhe*ZbJ*hbQkkTys(0s|@uLXFs(Wqm}-0&WJ^hi@TiO+JW7{G2o*6Jn{mzrW}v zQ~)ca65<3u!P+VWf-WGCeBh)11+coC{NY5n8`t-No>^+YjtU!&{g+ViD8~LFeW4sX z4^)!)z(=#>fg{)d5~bnK#I^>Y%i+tYJ{0wDu-kvdq=)2pkhdS*>VM5bzIAm)JC&vy ziQN1?O1nv{u<|?`CeQz(NTGn@H&gh9TwDK?>mUcea4b3eayKELfkb(Ku*9_pu#05;-(Q$14KD9evKtbiyOr2j8|{$}%b#{hvdGSpugc=;g|b5|WPZ zV0+=8i(UDYn}ey=)k`lPS8_78l#GPI=f1#a%-Eg_M-jXSwS%{gQ3N?MigDm#u#a-P zu>Xw??7!NDeFr?S|2Gl#QGRd#0rpwJJ_qbS6zua6_Ho3?@w`c9O6ZkCnEwE`3F^ky z3sGE5Aqb|gVm0W0zy(w1)5}awjaa(;H&VVKgJV2 znG!utSkO?UnlhZbirDT%xv^Y#5kX?!^+&8+WiywU^lX`5M&8rdGEK6+f*lt%bOb)D z?KnmqvvxyawQukjjCa{X_>AA=nh*7YTpYTNSy=~1qHN39WC!rOAlGr~?Ow)LN%(YS znK*b%WT4{U83_vaT>y_(_#w2$g86+6^Q=!2T!#NSJo%|cb05EjGDO?Mu7p&7pHH7B z?5coO>Q}3}Q8ypr3LU@9!m~_%kI7R^-eyu}@*I;dF%bec_#^=X5*tK%o3NE4`;lWd zI`hO4W?~BH5fYtTG-r{2!rz6}2@^7eX%k+aC>8LddHD5?c4i;W?g8qzfp0IK4ByH=V%=cs%_G Ku1zQh;Qs>^Q=GH_ literal 0 HcmV?d00001 diff --git 
a/aerialvision/__pycache__/variableclasses.cpython-310.pyc b/aerialvision/__pycache__/variableclasses.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4b348933624221b45c2ed7c8779c089a792fcf5d GIT binary patch literal 4892 zcma)9O^+N$8LsND>7AKf+v^W(C4@-;DP!VYXOq}Tu#AOm5K%G*R*GYTX?nYA$L;Nz zo~^3!u1B*ZiXDj?0uEeq*bOIh~+Q2*8dxyjQ4y~57vv^OYZXV_~;Hqf$IS9C&EBN7vtBSN@5VQK-NXTM#(SN7y`FYhm> z%h^4Z>yAR# z@LT7j6Vqu+)3F&zfBMi3>27X;4>@Co87}ze-i||=5GHg#4|jYKlh%laXo`8XgVDzh zIZrGQTH=^E{un6k2I9mXzrh~yop2nA#dCzN4@5)Zonx*Q8itmC4r2&a;2erz!mB|8 zLJ>|{@tEDmybVu%h#E9_!_%wpu}}Fe=>95J2G<8yI_}8_Os~yw18|Mwp2FPC8Ey#f zBgbuk`^gM90{3^0+XVLy2i$7+fSI|6f8lH=;%rnOM>{oB7pKSk4DQ9*xb`V5W+(bh zw8ghr`%I_pSwHU^8?F~+YMw&W>?tAflnm`@KqLfdhc?n#TBOQG51=HavUP3JY(t7Q z^u<-*iohtSkCy5zCfO+O4L+0^zG9ocF}^D|P*X3uHLFoEMiqIM;}CgY$ZcD5pB3Ro zS(eeErVpKsPPK}nX%j;GZoI>RG23Iie2@L!+w@i|qrmM#ZB#z6VNq^Lg|@eJ(T1kn zkbP%16*-25Z2d}+Ym=3|!PQ%|I&F29jV|n~D$ZFd^AlC}R$NSzv@qE@l(9c z7bb7rr`F-Jz@x7sa0A{%i9hlFGdedF&(iGrNA!bqg;dvh4%!r!f>;Ebc@Z@ZtM%1_ zK}>qYKSexpuk-7I{5s+jXh2p>r5$!0nCe?tXmpXB+Nx9U4^p7B-2(W96ct`ussxzR zL$*k)>LjMML)E~i!s#`tIS8AmJ>d?OuM2QypFYNFno#G95J&`J1z#Q($jtaoyGA~^ zpY~*u*m{!mN-->`-b|9sVOmt2N~_2{tG9?;CPJa)5Fx`*B-3jsn%r{C7W}x;Y`NYH z9Y-FrnQJKe6v#n%79NFr;X|Z>jS&Jjda2s@8gMk*0k?#T3#(Na22g;-bJTYii7P}s33~E+>bo6 zx&W^XkG$*cu^n&h=@TwKdb_^x_PA<~z1^VV>>^_B@f}2D#CG->Vlvg!$c^XPWmgl6 zv|NqmH>B#zqE+O5*?RXGZ>iHHryEWbocDJ_U4C_7{`&ZazJf^~_@l8RGt!XxDo9LkKVkNG(Up~O&C&+suq%4MkeCXz*@A0&Px0l&bf zx=2ukV1GkCw7wqp)Eco4ynu7H5`BM0ajrIS@iE||g+oWg%p<3xefFXKF&d*fnc#hx zA3?F>-C}ee?p`{%@$UQTB@+m`Lp93d?6wQ0*8}fs6`_(q` zLcf=8SJ%9a-EAo3+OVrGpx*W?>;(1urZUFclg{vh3>zLo9%Jj-=DPLaL@`yA(H;!1$GNDqmqe}$9JIQ=zj!B zj02Z0oOk`_NluhF-v;XV!J8^`ocGr$9gqm`E-xeC6bV3eiO}_*gcafe5P#hgUBORa zZ-6@J%B(Qv>25!#U$NC;PyZDQIFV42{;-2`sUJ8kaH z1Ft~IxFWNd+1A#TG(*)VC-)Dq-9HF^mrot+dobXejo>{@J>mqM5y04;w4P@K6QzH} zDg7%)m0nT6nCiLnXNpd~uji_u=7Bbm{A{B-r&G*-qC=BFoh~C1Qflw?`1I_(+<4@# z->k%%HZw#Hc0ABuI@#Y)GzovAzfsdy)ZalYp;Xr8R4(ecNL*R1yV7%3wDFZHd%j9j 
SomnjWXyGjwMX<22xb#2ERZT4b literal 0 HcmV?d00001 diff --git a/aerialvision/configs.py b/aerialvision/configs.py index f0389ac20..01dba2e83 100644 --- a/aerialvision/configs.py +++ b/aerialvision/configs.py @@ -61,7 +61,7 @@ # Vancouver, BC V6T 1Z4 -import ConfigParser, os +import configparser, os userSettingPath = os.path.join(os.environ['HOME'], '.gpgpu_sim', 'aerialvision') @@ -69,14 +69,14 @@ class AerialVisionConfig: def __init__(self): - self.config = ConfigParser.SafeConfigParser() + self.config = configparser.SafeConfigParser() self.config.read( os.path.join(userSettingPath, 'config.rc') ) def print_all(self): for section in self.config.sections(): for option in self.config.options(section): value = self.config.get(section, option) - print "\t%s.%s = %s" % (section, option, value); + print("\t%s.%s = %s" % (section, option, value)); def get_value(self, section, option, default): if (self.config.has_option(section, option)): @@ -90,10 +90,11 @@ def get_value(self, section, option, default): #Unit test / configviewer def main(): - print "AerialVision Options:" + print("AerialVision Options:") avconfig.print_all() - print ""; + print(""); if __name__ == "__main__": main() + diff --git a/aerialvision/guiclasses.py b/aerialvision/guiclasses.py index 04036a8a3..f4ecd2938 100644 --- a/aerialvision/guiclasses.py +++ b/aerialvision/guiclasses.py @@ -64,10 +64,10 @@ import time import os import array -import Tkinter as Tk +import tkinter as Tk import matplotlib matplotlib.use('TkAgg') -from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2TkAgg +from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk from matplotlib.figure import Figure import matplotlib as mpl from matplotlib.colors import colorConverter @@ -250,19 +250,19 @@ def chooseFile(self, *event): self.cXAxisData.delete(0, Tk.END) self.cYAxisData.delete(0, Tk.END) - + #filling in xAxis vars - for keys in self.data[self.fileChosen].keys(): + for keys 
in list(self.data[self.fileChosen].keys()): if keys == 'globalCycle': self.cXAxisData.insert(Tk.END, keys) #filling in yAxis vars #Need to fill up list alphabetically keysAlpha = [] - for key in self.data[self.fileChosen].keys(): + for key in list(self.data[self.fileChosen].keys()): if key not in ['globalCycle','CFLOG','EXTVARS']:#exclude hacks from list keysAlpha.append(key) - keysAlpha.sort(lambda x, y: cmp(x.lower(),y.lower())) + #keysAlpha.sort(key=lambda x, y: cmp(x.lower(),y.lower())) for keys in keysAlpha: self.cYAxisData.insert(Tk.END, keys) @@ -782,7 +782,7 @@ def __init__(self, master, data, res, dataChosen): #self.plot = self.figure.add_subplot(111) self.canvas = FigureCanvasTkAgg(self.figure, master=self.graphArea) self.canvas.get_tk_widget().pack() - self.toolbar = NavigationToolbar2TkAgg(self.canvas, self.toolbarArea) + self.toolbar = NavigationToolbar2Tk(self.canvas, self.toolbarArea) self.toolbar.update() self.plotData() @@ -931,7 +931,7 @@ def type1Variable(self, x, xAxis, y, yAxis, boolK, plotID): - if self.simplerName.has_key('globalTotInsn') == 'False': + if ('globalTotInsn' in self.simplerName) == 'False': graphOption = 1 if (graphOption == 1): @@ -972,7 +972,7 @@ def type2Variable(self, x, xAxis, y, yAxis, plotID): graphOption = "NULL" - if self.simplerName.has_key('globalTotInsn') == 'False': + if ('globalTotInsn' in self.simplerName) == 'False': graphOption = 1 if (graphOption == 1): @@ -1018,7 +1018,7 @@ def type3Variable(self, x, xAxis, y, yAxis, plotID): #if there are kernals.. we need to adjust the x axis for proper labelling #Need to make changes here.. 
works for now though - if self.simplerName.has_key('globalTotInsn'): + if 'globalTotInsn' in self.simplerName: x = self.updateVarKernal(x) concentrationFactor = len(x) // 512 + 1 @@ -1038,7 +1038,7 @@ def type3Variable(self, x, xAxis, y, yAxis, plotID): yoff = numpy.array([0.0] * numCols) #variable use to remember the last top location of a bar so that we may stack the proceeding bar on top of it #Legendname = ['UNUSED', 'UNUSED', 'FQPUSHED','ICNT_PUSHED','ICNT_INJECTED','ICNT_AT_DEST','DRAMQ','DRAM_PROCESSING_START','DRAM_PROCESSING_END','DRAM_OUTQ','2SH_ICNT_PUSHED','2SH_ICNT_INJECTED','2SH_ICNT_AT_DEST','2SH_FQ_POP','RETURN_Q']; Legendname = ['N/A', 'N/A','N/A','IcntInpBuf','N/A','Icnt2DRAM','N/A','N/A','N/A','DRAM','2Sh_IcntInpBuf','N/A','Icnt2shd','N/A','N/A']; - BarSequence = range(numRows-1,-1,-1) + BarSequence = list(range(numRows-1,-1,-1)) if yAxis == 'WarpDivergenceBreakdown': Legendname = [] @@ -1046,21 +1046,21 @@ def type3Variable(self, x, xAxis, y, yAxis, plotID): Legendname.append('Data Hazard') Legendname.append('Stall') for c in range(2, numRows): - Legendname.append('W' + `4*(c-2)+1` + ':' + `4*(c-1)`) - BarSequence = range(0,numRows) + Legendname.append('W' + repr(4*(c-2)+1) + ':' + repr(4*(c-1))) + BarSequence = list(range(0,numRows)) if yAxis == 'WarpIssueSlotBreakdown': Legendname = [] for c in range(0, numRows): - Legendname.append('W' + `c`) - BarSequence = range(0,numRows) + Legendname.append('W' + repr(c)) + BarSequence = list(range(0,numRows)) dynamic_warp_resolution = 32 if yAxis == 'WarpIssueDynamicIdBreakdown': Legendname = [] for c in range(0, numRows): - Legendname.append('W' + `dynamic_warp_resolution*c` + ":" + `dynamic_warp_resolution*(c+1)`) - BarSequence = range(0,numRows) + Legendname.append('W' + repr(dynamic_warp_resolution*c) + ":" + repr(dynamic_warp_resolution*(c+1))) + BarSequence = list(range(0,numRows)) yoff_max = numpy.array([0.0] * numCols) for row in range(numRows-1,-1,-1): @@ -1102,10 +1102,10 @@ def 
type3Variable(self, x, xAxis, y, yAxis, plotID): for label in self.plot.get_yticklabels(): label.set_fontsize(plotFormat.yticksFontSize) - self.canvas.show() + self.canvas.draw() def type4Variable(self, x, xAxis, y, yAxis, plotID): - keys = y.keys() + keys = list(y.keys()) keys.sort() if (self.dataPointer.graphChosen == self.possGraphs[3]): @@ -1251,7 +1251,7 @@ def plot2VarLine(self, x, xAxis, y, yAxis): self.plot.set_title(self.plotFormatInfo[self.currPlot].title) self.plot.set_xlabel(self.plotFormatInfo[self.currPlot].xlabel, fontsize = self.plotFormatInfo[self.currPlot].labelFontSize) self.plot.set_ylabel(self.plotFormatInfo[self.currPlot].ylabel, fontsize = self.plotFormatInfo[self.currPlot].labelFontSize) - self.canvas.show() + self.canvas.draw() def plotMultVarLine(self, x, xAxis, y, yAxis): @@ -1261,7 +1261,7 @@ def plotMultVarLine(self, x, xAxis, y, yAxis): self.plotFormatInfo[self.currPlot].InitLabels(xlabel = xAxis, ylabel = yAxis, cbarlabel = '', title = '') self.plot.set_xlabel(self.plotFormatInfo[self.currPlot].xlabel, fontsize = self.plotFormatInfo[self.currPlot].labelFontSize) self.plot.set_ylabel(self.plotFormatInfo[self.currPlot].ylabel, fontsize = self.plotFormatInfo[self.currPlot].labelFontSize) - self.canvas.show() + self.canvas.draw() def plotScatter(self, x, xAxis, y, yAxis, plotID): @@ -1275,7 +1275,7 @@ def plotScatter(self, x, xAxis, y, yAxis, plotID): self.plot.set_title(plotFormat.title, fontsize = plotFormat.labelFontSize) self.plot.set_xlabel(plotFormat.xlabel, fontsize = plotFormat.labelFontSize) self.plot.set_ylabel(plotFormat.ylabel, fontsize = plotFormat.labelFontSize) - self.canvas.show() + self.canvas.draw() def takeDerivativeMult(self,x,y): @@ -1347,12 +1347,12 @@ def plotParallelIntensity(self, x, xAxis, y, yAxis, colorAxis, yTicks, plotID): # put number on axis if there are more than one ticks if (self.xAxisStepsWilStack[self.currPlot] != 1): - for count in range(0,len(x),len(x)/self.xAxisStepsWilStack[self.currPlot]): + for 
count in range(0,len(x),int(len(x)/self.xAxisStepsWilStack[self.currPlot])): xlabelValues.append(x[count]) xlabelPos.append(xticksPos[count]) - print self.yAxisStepsWilStack[self.currPlot] - for count in range(0,len(y),len(y)/self.yAxisStepsWilStack[self.currPlot]): + print(self.yAxisStepsWilStack[self.currPlot]) + for count in range(0,len(y),int(len(y)/self.yAxisStepsWilStack[self.currPlot])): ylabelValues.append(yTicks[count]) ylabelPos.append(yticksPos[count]) @@ -1387,7 +1387,7 @@ def plotParallelIntensity(self, x, xAxis, y, yAxis, colorAxis, yTicks, plotID): xtickStep = x[1] - x[0] self.plot.set_xlim(0 / xtickStep - 0.5, self.xlim / xtickStep + 0.5) - self.canvas.show() + self.canvas.draw() def updateWilTicks(self, z): x= [] @@ -1480,7 +1480,7 @@ def changeColorMapMaxMin(self): else: for iter in range(0, self.dataPointer.dydx): if self.simplerName[self.dataPointer.dataChosenY].type == 4: - keys = self.simplerName[self.dataPointer.dataChosenY].data.keys() + keys = list(self.simplerName[self.dataPointer.dataChosenY].data.keys()) keys.sort() y = [] for iter in keys: @@ -1523,7 +1523,7 @@ def changeColorMapMaxMin(self): entry[self.currPlot] = (maxEntry, minEntry) cmap = self.plotFormatInfo[self.currPlot].cmap - plotCMap = apply(Tk.OptionMenu, (root[-1], cmap) + tuple(PlotFormatInfo.cmapOptions)) + plotCMap = Tk.OptionMenu(*(root[-1], cmap) + tuple(PlotFormatInfo.cmapOptions)) plotCMap.pack(side = Tk.LEFT, padx = 5) @@ -1612,7 +1612,7 @@ def collectDataChangeDiv(self, vars,master): for self.currPlot in range(1,numPlots + 1): self.findKernalLocs() - if vars.has_key(str(self.currPlot)): + if str(self.currPlot) in vars: if vars[str(self.currPlot)].get() == 1: self.dataPointer.dydx += 1 @@ -1681,12 +1681,12 @@ def collectDataIncreaseYBinning(self, currPlot): if (self.yAxisStepsWilStack[plotToIncrease] == 1): self.yAxisStepsWilStack[plotToIncrease] = 2 self.yAxisStepsWilStack[plotToIncrease] = int(float(self.yAxisStepsWilStack[plotToIncrease])*1.50) - print 
self.yAxisStepsWilStack[plotToIncrease] + print(self.yAxisStepsWilStack[plotToIncrease]) self.plotDataForNewBinning(plotToIncrease) def collectDataDecreaseYBinning(self, currPlot, remove = False): plotToDecrease = int(currPlot[0]) - print self.yAxisStepsWilStack[plotToDecrease] + print(self.yAxisStepsWilStack[plotToDecrease]) if (remove == True): self.yAxisStepsWilStack[plotToDecrease] = 1 else: @@ -1751,7 +1751,7 @@ def editLabelsButton(self): entries[self.currPlot].append(Tk.Entry(root, width = 50)) entries[self.currPlot][-1].grid(row = currentRow, column = 4, padx = 10) entries[self.currPlot][-1].insert(0, self.plot.get_xlabel()) - if self.colorbars.has_key(self.currPlot): + if self.currPlot in self.colorbars: plotLabel3 = Tk.Label(root, text = 'Colorbar: ', bg = 'white') plotLabel3.grid(row = currentRow, column = 5) entries[self.currPlot].append(Tk.Entry(root, width = 20)) @@ -1821,7 +1821,7 @@ def collectDataEditLabels(self, entries, master): self.plot.set_ylabel(plotFormat.ylabel, fontsize=plotFormat.labelFontSize) plotFormat.xlabel = entries[self.currPlot][1].get() self.plot.set_xlabel(plotFormat.xlabel, fontsize=plotFormat.labelFontSize) - if self.colorbars.has_key(self.currPlot): + if self.currPlot in self.colorbars: plotFormat.cbarlabel = entries[self.currPlot][2].get() self.colorbars[self.currPlot].set_label(plotFormat.cbarlabel, fontsize=plotFormat.labelFontSize) else: @@ -1841,7 +1841,7 @@ def collectDataEditLabels(self, entries, master): ytickslabels[n].set_fontsize(plotFormat.yticksFontSize) # change colorbar ticks label fontsize - if self.colorbars.has_key(self.currPlot): + if self.currPlot in self.colorbars: for label in self.colorbars[self.currPlot].ax.get_yticklabels(): label.set_fontsize(plotFormat.cticksFontSize) @@ -1851,7 +1851,7 @@ def collectDataEditLabels(self, entries, master): master.destroy() ## Now replot with changes..... 
- self.canvas.show() + self.canvas.draw() def zoomButton(self): #Variable initializations @@ -1980,7 +1980,7 @@ def zoomCollect(self, entries, master): plot.set_xticks(xlabelPos) master.destroy() - self.canvas.show() + self.canvas.draw() class NaviPlotInfo: @@ -2224,6 +2224,7 @@ def showData(self): countLines = 1 for lines in self.file.readlines(): + lines = lines.decode() self.textbox.insert(Tk.END, str(countLines) + '. ' + lines, ('normal')) countLines += 1 countLines -= 1 @@ -2232,7 +2233,7 @@ def showData(self): figure = Figure(figsize=(22,5), dpi = 70) self.histArea = FigureCanvasTkAgg(figure, master= bottomFrame) self.histArea.get_tk_widget().pack() - toolbar = NavigationToolbar2TkAgg(self.histArea, toolbarFrame) + toolbar = NavigationToolbar2Tk(self.histArea, toolbarFrame) toolbar.update() self.histogram = figure.add_subplot(111) cid = figure.canvas.mpl_connect('button_press_event',self.onclick) @@ -2285,8 +2286,8 @@ def showData(self): count += 1 def yview(self, *args): - apply(self.textbox.yview, args) - apply(self.statstextbox.yview, args) + self.textbox.yview(*args) + self.statstextbox.yview(*args) def onclick(self, event): if event.button == 3: @@ -2298,6 +2299,7 @@ def onclick(self, event): self.textbox.delete(0.0, Tk.END) self.file = open(self.fileChosen, 'r') for lines in self.file.readlines(): + lines=lines.decode() if (countLines < event.xdata - 1) or (countLines > event.xdata + 1): self.textbox.insert(Tk.END, str(countLines) + '. 
' + lines, ('normal')) else: @@ -2317,8 +2319,8 @@ def onclick(self, event): - apply(self.textbox.yview, args) - apply(self.statstextbox.yview, args) + self.textbox.yview(*args) + self.statstextbox.yview(*args) def chooseFileCuda(self, *event): self.fileChosen = self.cAvailableCudaFiles.get('active') @@ -2572,3 +2574,4 @@ def decreaseBinning(self): + diff --git a/aerialvision/lexyacc.py b/aerialvision/lexyacc.py index d657383eb..53541ed44 100644 --- a/aerialvision/lexyacc.py +++ b/aerialvision/lexyacc.py @@ -82,7 +82,7 @@ def import_user_defined_variables(variables): try: file = open(os.path.join(userSettingPath, 'variables.txt'),'r') except: - print "No variables.txt file found." + print("No variables.txt file found.") return #this can be replaced with a proper lex-yacc parser later @@ -96,7 +96,7 @@ def import_user_defined_variables(variables): continue # parse the line containing definition of a stat variable - s = line.split(",") + s = line.split(',') statName = s[0] statVar = vc.variable('', 1, 0) statVar.importFromString(line) @@ -104,8 +104,9 @@ def import_user_defined_variables(variables): # add parsed stat variable to the searchable map variables[statName] = statVar - except Exception, (e): - print "error:",e,", in variables.txt line:",line + except Exception as xxx_todo_changeme: + (e) = xxx_todo_changeme + print("error:",e,", in variables.txt line:",line) # Parses through a given log file for data def parseMe(filename): @@ -136,7 +137,7 @@ def t_newline(t): t.lexer.lineno += t.value.count("\n") def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) lex.lex() @@ -202,14 +203,14 @@ def t_error(t): # generate a lookup table based on the specified name in log file for each stat stat_lookuptable = {} - for name, var in variables.iteritems(): + for name, var in variables.items(): if (name == 'CFLOG'): continue; if (var.lookup_tag != ''): stat_lookuptable[var.lookup_tag] = var else: 
stat_lookuptable[name.lower()] = var - + inputData = 'NULL' # a table containing all the metrics that has received the missing data warning @@ -218,19 +219,19 @@ def t_error(t): def p_sentence(p): '''sentence : WORD NUMBERSEQUENCE''' #print p[0], p[1],p[2] - num = p[2].split(" ") + num = p[2].split(' ') # detect empty data entry for particular metric and print a warning if p[2] == '': if not p[1] in stat_missing_warned: - print "WARNING: Sample entry for metric '%s' has no data. Skipping..." % p[1] + print("WARNING: Sample entry for metric '%s' has no data. Skipping..." % p[1]) stat_missing_warned[p[1]] = True return lookup_input = p[1].lower() if (lookup_input in stat_lookuptable): if (lookup_input == "globalcyclecount") and (int(num[0]) % 10000 == 0): - print "Processing global cycle %s" % num[0] + print("Processing global cycle %s" % num[0]) stat = stat_lookuptable[lookup_input] if (stat.type == 1): @@ -294,7 +295,7 @@ def p_sentence(p): def p_error(p): if p: - print("Syntax error at '%s'" % p.value) + print(("Syntax error at '%s'" % p.value)) else: print("Syntax error at EOF") @@ -306,11 +307,12 @@ def p_error(p): else: file = open(filename, 'r') while file: - line = file.readline() + line = file.readline().decode() + if not line : break - nameNdata = line.split(":") + nameNdata = line.split(':') if (len(nameNdata) != 2): - print("Syntax error at '%s'" % line) + print(("Syntax error at '%s'" % line)) namePart = nameNdata[0].strip() dataPart= nameNdata[1].strip() parts = [' ', namePart, dataPart] @@ -323,3 +325,4 @@ def p_error(p): + diff --git a/aerialvision/lexyaccbookmark.py b/aerialvision/lexyaccbookmark.py index 42c6b406e..7aa2f800f 100644 --- a/aerialvision/lexyaccbookmark.py +++ b/aerialvision/lexyaccbookmark.py @@ -108,7 +108,7 @@ def t_NOTHING(t): def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) lex.lex() @@ -150,7 +150,7 @@ def p_sentence(p): pass else: - print 'An Parsing 
Error has occurred' + print('An Parsing Error has occurred') @@ -159,7 +159,7 @@ def p_sentence(p): def p_error(p): if p: - print("Syntax error at '%s'" % p.value) + print(("Syntax error at '%s'" % p.value)) else: print("Syntax error at EOF") @@ -168,7 +168,7 @@ def p_error(p): try: file = open(os.environ['HOME'] + '/.gpgpu_sim/aerialvision/bookmarks.txt', 'r') inputData = file.readlines() - except IOError,e: + except IOError as e: if e.errno == 2: inputData = '' else: @@ -178,3 +178,4 @@ def p_error(p): yacc.parse(x[0:-1]) # ,debug=True) return listBookmarks + diff --git a/aerialvision/lexyacctexteditor.py b/aerialvision/lexyacctexteditor.py index 51d3ced44..57b41db82 100644 --- a/aerialvision/lexyacctexteditor.py +++ b/aerialvision/lexyacctexteditor.py @@ -88,7 +88,7 @@ def t_newline(t): t.lexer.lineno += t.value.count("\n") def t_error(t): - print "Illegal character '%s'" % t.value[0] + print("Illegal character '%s'" % t.value[0]) t.lexer.skip(1) lex.lex() @@ -109,8 +109,8 @@ def p_sentence(p): def p_error(p): if p: - print("Syntax error at '%s'" % p.value) - print p + print(("Syntax error at '%s'" % p.value)) + print(p) else: print("Syntax error at EOF") @@ -152,17 +152,18 @@ def ptxToCudaMapping(filename): loc = int(m.group(2)) count += 1 - x = map.keys() + x = list(map.keys()) return map #Unit test / playground def main(): data = textEditorParseMe(sys.argv[1]) - print data[100] + print(data[100]) if __name__ == "__main__": main() + diff --git a/aerialvision/organizedata.py b/aerialvision/organizedata.py index 090b90f13..f5d5312c3 100644 --- a/aerialvision/organizedata.py +++ b/aerialvision/organizedata.py @@ -99,7 +99,7 @@ def organizedata(fileVars): } data_type_char = {int:'I', float:'f'} - print "Organizing data into internal format..." 
+ print("Organizing data into internal format...") # Organize globalCycle in advance because it is used as a reference if ('globalCycle' in fileVars): @@ -107,28 +107,28 @@ def organizedata(fileVars): fileVars['globalCycle'].data = organizeFunction[statData.organize](statData.data, data_type_char[statData.datatype]) # Organize other stat data into internal format - for statName, statData in fileVars.iteritems(): + for statName, statData in fileVars.items(): if (statName != 'CFLOG' and statName != 'globalCycle' and statData.organize != 'custom'): fileVars[statName].data = organizeFunction[statData.organize](statData.data, data_type_char[statData.datatype]) # Custom routines to organize stat data into internal format - if fileVars.has_key('averagemflatency'): + if 'averagemflatency' in fileVars: zeros = [] for count in range(len(fileVars['averagemflatency'].data),len(fileVars['globalCycle'].data)): zeros.append(0) fileVars['averagemflatency'].data = zeros + fileVars['averagemflatency'].data - if (skipCFLog == 0) and fileVars.has_key('CFLOG'): + if (skipCFLog == 0) and 'CFLOG' in fileVars: ptxFile = CFLOGptxFile statFile = CFLOGInsnInfoFile - print "PC Histogram to CUDA Src = %d" % convertCFLog2CUDAsrc + print("PC Histogram to CUDA Src = %d" % convertCFLog2CUDAsrc) parseCFLOGCUDA = convertCFLog2CUDAsrc if parseCFLOGCUDA == 1: - print "Obtaining PTX-to-CUDA Mapping from %s..." % ptxFile + print("Obtaining PTX-to-CUDA Mapping from %s..." % ptxFile) map = lexyacctexteditor.ptxToCudaMapping(ptxFile.rstrip()) - print "Obtaining Program Range from %s..." % statFile + print("Obtaining Program Range from %s..." % statFile) maxStats = max(lexyacctexteditor.textEditorParseMe(statFile.rstrip()).keys()) if parseCFLOGCUDA == 1: @@ -136,7 +136,7 @@ def organizedata(fileVars): for lines in map: for ptxLines in map[lines]: newMap[ptxLines] = lines - print " Total number of CUDA src lines = %s..." % len(newMap) + print(" Total number of CUDA src lines = %s..." 
% len(newMap)) markForDel = [] for ptxLines in newMap: @@ -144,7 +144,7 @@ def organizedata(fileVars): markForDel.append(ptxLines) for lines in markForDel: del newMap[lines] - print " Number of touched CUDA src lines = %s..." % len(newMap) + print(" Number of touched CUDA src lines = %s..." % len(newMap)) fileVars['CFLOGglobalPTX'] = vc.variable('',2,0) fileVars['CFLOGglobalCUDA'] = vc.variable('',2,0) @@ -152,7 +152,7 @@ def organizedata(fileVars): count = 0 for iter in fileVars['CFLOG']: - print "Organizing data for %s" % iter + print("Organizing data for %s" % iter) fileVars[iter + 'PTX'] = fileVars['CFLOG'][iter] fileVars[iter + 'PTX'].data = CFLOGOrganizePTX(fileVars['CFLOG'][iter].data, fileVars['CFLOG'][iter].maxPC) @@ -174,7 +174,7 @@ def organizedata(fileVars): for columns in range(0, len(fileVars[iter + 'CUDA'].data[rows])): fileVars['CFLOGglobalCUDA'].data[rows][columns] += fileVars[iter + 'CUDA'].data[rows][columns] except: - print "Error in generating globalCFLog data" + print("Error in generating globalCFLog data") count += 1 del fileVars['CFLOG'] @@ -231,10 +231,10 @@ def nullOrganizedStackedBar(nullVar, datatype_c): for row in range (0,len(organized)): newy = array.array(datatype_c, [0 for col in range(newLen)]) for col in range(0, len(organized[row])): - newcol = col / n_data + newcol = int(col / n_data) newy[newcol] += organized[row][col] for col in range(0, len(newy)): - newy[col] /= n_data + newy[col] = int(newy[col]/n_data) organized[row] = newy return organized @@ -320,15 +320,15 @@ def CFLOGOrganizeCuda(list, ptx2cudamap): nSamples = len(list[0]) # create a dictionary of empty data array (one array per cuda source line) - for ptxline, cudaline in ptx2cudamap.iteritems(): - if tmp.has_key(cudaline): + for ptxline, cudaline in ptx2cudamap.items(): + if cudaline in tmp: pass else: tmp[cudaline] = [0 for lengthData in range(nSamples)] for cudaline in tmp: - for ptxLines, mapped_cudaline in ptx2cudamap.iteritems(): + for ptxLines, mapped_cudaline 
in ptx2cudamap.items(): if mapped_cudaline == cudaline: for lengthData in range(nSamples): tmp[cudaline][lengthData] += list[ptxLines][lengthData] @@ -336,7 +336,7 @@ def CFLOGOrganizeCuda(list, ptx2cudamap): final = [] for iter in range(min(tmp.keys()),max(tmp.keys())): - if tmp.has_key(iter): + if iter in tmp: final.append(tmp[iter]) else: final.append([0 for lengthData in range(nSamples)]) @@ -356,3 +356,4 @@ def CFLOGOrganizeCuda(list, ptx2cudamap): # return organized + diff --git a/aerialvision/parser.out b/aerialvision/parser.out new file mode 100644 index 000000000..809874f58 --- /dev/null +++ b/aerialvision/parser.out @@ -0,0 +1,47 @@ +Created by PLY version 3.11 (http://www.dabeaz.com/ply) + +Grammar + +Rule 0 S' -> sentence +Rule 1 sentence -> WORD NUMBERSEQUENCE + +Terminals, with rules where they appear + +NUMBERSEQUENCE : 1 +WORD : 1 +error : + +Nonterminals, with rules where they appear + +sentence : 0 + +Parsing method: LALR + +state 0 + + (0) S' -> . sentence + (1) sentence -> . WORD NUMBERSEQUENCE + + WORD shift and go to state 2 + + sentence shift and go to state 1 + +state 1 + + (0) S' -> sentence . + + + +state 2 + + (1) sentence -> WORD . NUMBERSEQUENCE + + NUMBERSEQUENCE shift and go to state 3 + + +state 3 + + (1) sentence -> WORD NUMBERSEQUENCE . + + $end reduce using rule 1 (sentence -> WORD NUMBERSEQUENCE .) + diff --git a/aerialvision/parsetab.py b/aerialvision/parsetab.py new file mode 100644 index 000000000..47a38843c --- /dev/null +++ b/aerialvision/parsetab.py @@ -0,0 +1,31 @@ + +# parsetab.py +# This file is automatically generated. Do not edit. 
+# pylint: disable=W,C,R +_tabversion = '3.10' + +_lr_method = 'LALR' + +_lr_signature = 'NUMBERSEQUENCE WORDsentence : WORD NUMBERSEQUENCE' + +_lr_action_items = {'WORD':([0,],[2,]),'$end':([1,3,],[0,-1,]),'NUMBERSEQUENCE':([2,],[3,]),} + +_lr_action = {} +for _k, _v in _lr_action_items.items(): + for _x,_y in zip(_v[0],_v[1]): + if not _x in _lr_action: _lr_action[_x] = {} + _lr_action[_x][_k] = _y +del _lr_action_items + +_lr_goto_items = {'sentence':([0,],[1,]),} + +_lr_goto = {} +for _k, _v in _lr_goto_items.items(): + for _x, _y in zip(_v[0], _v[1]): + if not _x in _lr_goto: _lr_goto[_x] = {} + _lr_goto[_x][_k] = _y +del _lr_goto_items +_lr_productions = [ + ("S' -> sentence","S'",1,None,None,None), + ('sentence -> WORD NUMBERSEQUENCE','sentence',2,'p_sentence','lexyacc.py',220), +] diff --git a/aerialvision/startup.py b/aerialvision/startup.py index ae14fd394..d261c0c10 100644 --- a/aerialvision/startup.py +++ b/aerialvision/startup.py @@ -62,11 +62,11 @@ import sys -import Tkinter as Tk +import tkinter as Tk import Pmw import lexyacc import guiclasses -import tkFileDialog as Fd +import tkinter.filedialog as Fd import organizedata import os import os.path @@ -160,7 +160,7 @@ def fileInput(cl_files=None): tmprecentfile = tmprecentfile.split('/') for iter in range(1,len(tmprecentfile) - 1): recentfile = recentfile + '/' + tmprecentfile[iter] - except IOError,e: + except IOError as e: if e.errno == 2: # recentfiles.txt does not exist, ignore and use CWD recentfile = '.' 
@@ -313,7 +313,7 @@ def loadRecentFile(entry): try: loadfile = open(os.path.join(userSettingPath, 'recentfiles.txt'), 'r') recentfiles = loadfile.readlines() - except IOError,e: + except IOError as e: if e.errno == 2: recentfiles = '' else: @@ -323,7 +323,7 @@ def loadRecentFile(entry): recentFileWindow.pack(side = Tk.TOP) scrollbar = Tk.Scrollbar(recentFileWindow, orient = Tk.VERTICAL) cRecentFile = Tk.Listbox(recentFileWindow, width = 100, height = 15, yscrollcommand = scrollbar.set) - cRecentFile.bind("", lambda(event): recentFileInsert(entry, cRecentFile.get('active'), instance)) + cRecentFile.bind("", lambda event: recentFileInsert(entry, cRecentFile.get('active'), instance)) cRecentFile.pack(side = Tk.LEFT) scrollbar.config(command = cRecentFile.yview) scrollbar.pack(side = Tk.LEFT, fill = Tk.Y) @@ -391,9 +391,9 @@ def addListToListbox(listbox,list): Filenames.append(string) listbox.insert(Tk.END, string) else: - print 'Could not open file: ' + string + print('Could not open file: ' + string) except: - print 'Could not open file: ' + file + print('Could not open file: ' + file) def errorMsg(string): @@ -447,6 +447,7 @@ def submitClicked(instance, num, skipcflog, cflog2cuda, listboxes): startup(res, [TEFiles, TEPTXFiles, TEStatFiles]) def graphAddTab(vars, graphTabs,res, entry): + TabsForGraphs.append(guiclasses.formEntry(graphTabs, str(len(TabsForGraphs) + 1), vars, res, entry)) entry.delete(0, Tk.END) @@ -586,7 +587,7 @@ def startup(res, TEFILES): organizedata.setCFLOGInfoFiles(TEFILES) for files in Filenames: vars[files] = organizedata.organizedata(vars[files]) - + graphAddTab(vars, graphTabs, res, eAddTab) @@ -873,3 +874,4 @@ def manageFilesSubmit(window, listbox): + diff --git a/aerialvision/variableclasses.py b/aerialvision/variableclasses.py index 18850a1ce..30d8d2d17 100644 --- a/aerialvision/variableclasses.py +++ b/aerialvision/variableclasses.py @@ -102,8 +102,9 @@ def importFromString(self, string_spec): assert(self.organize == 'idx2DVec') elif 
(self.type == 5): assert(self.organize == 'sparse') - except Exception, (e): - print "Error in creating new stat variable from string: %s" % string_spec + except Exception as xxx_todo_changeme: + (e) = xxx_todo_changeme + print("Error in creating new stat variable from string: %s" % string_spec) raise e def initSparseMatrix(self): @@ -133,7 +134,7 @@ def loadLineStatName(filename): global lineStatName file = open(filename, 'r') while file: - line = file.readline() + line = file.readline().decode() if not line : break if (line.startswith('kernel line :')) : line = line.strip() @@ -171,7 +172,7 @@ def takeMax(self,key): except: tmp = 0 if cudaLineNo.debug: - print 'Exception in cudaLineNo.takeMax()', self.stats[key] + print('Exception in cudaLineNo.takeMax()', self.stats[key]) return tmp def takeRatioSums(self, key1,key2): @@ -182,9 +183,9 @@ def takeRatioSums(self, key1,key2): return tmp1/tmp2 except: if cudaLineNo.debug: - print tmp1, tmp2 + print(tmp1, tmp2) if tmp2 == 0 and cudaLineNo.debug: - print 'infinite' + print('infinite') return 0 @@ -209,7 +210,7 @@ def returnRatio(self, key1, key2): return tmp1/tmp2 except: if tmp2 == 0 and ptxLineNo.debug: - print 'infinite' + print('infinite') return 0 @@ -221,3 +222,4 @@ def returnRatio(self, key1, key2): + From 2b0df2f831c0e9d69015e36ea9da2a1dafefa003 Mon Sep 17 00:00:00 2001 From: christindbose Date: Wed, 10 May 2023 14:22:30 -0400 Subject: [PATCH 112/154] Removed some cached files --- .../__pycache__/configs.cpython-310.pyc | Bin 1444 -> 0 bytes .../__pycache__/guiclasses.cpython-310.pyc | Bin 69744 -> 0 bytes .../__pycache__/lexyacc.cpython-310.pyc | Bin 6623 -> 0 bytes .../__pycache__/lexyaccbookmark.cpython-310.pyc | Bin 2712 -> 0 bytes .../lexyacctexteditor.cpython-310.pyc | Bin 2825 -> 0 bytes .../__pycache__/organizedata.cpython-310.pyc | Bin 6305 -> 0 bytes .../__pycache__/parsetab.cpython-310.pyc | Bin 870 -> 0 bytes .../__pycache__/startup.cpython-310.pyc | Bin 24043 -> 0 bytes 
.../__pycache__/variableclasses.cpython-310.pyc | Bin 4892 -> 0 bytes 9 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 aerialvision/__pycache__/configs.cpython-310.pyc delete mode 100644 aerialvision/__pycache__/guiclasses.cpython-310.pyc delete mode 100644 aerialvision/__pycache__/lexyacc.cpython-310.pyc delete mode 100644 aerialvision/__pycache__/lexyaccbookmark.cpython-310.pyc delete mode 100644 aerialvision/__pycache__/lexyacctexteditor.cpython-310.pyc delete mode 100644 aerialvision/__pycache__/organizedata.cpython-310.pyc delete mode 100644 aerialvision/__pycache__/parsetab.cpython-310.pyc delete mode 100644 aerialvision/__pycache__/startup.cpython-310.pyc delete mode 100644 aerialvision/__pycache__/variableclasses.cpython-310.pyc diff --git a/aerialvision/__pycache__/configs.cpython-310.pyc b/aerialvision/__pycache__/configs.cpython-310.pyc deleted file mode 100644 index 577be954f86a8aab5b6ff96286c4d64d3363af4c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1444 zcmZuxZI2r@5VpOqNptC`FDKyoAxNA=kjS>Vbr74y4jIBJ-zF=<>;R%23rNLt&0ujC-BD`d2D8oyVMj{gN3zEjN zAsW9j^`49`3599{E8XVbP@VLR|Lv3SPZrJIcsicWxXvf0l}VLn#XQ$}RSq#~dogg1 zK=eDHBuS|tDHBvMILpI+;MaHEq=ASAq#K)CU)ze$Y{lcM9OdI7h4jrrI|Mlfv2|;* zqHD4woGl6Bt^KoL71oiUYqq97NdvaP$WHQOiBuj>; zD%Xb}K1d!_!`VcZwN7fO>xZ9yloZ)7hsnwD$;11f9Da;^&WfxCBKcHRXL49`=&$~O z&kqye`~9o^N`ZIfC)@NVu4mnw{$^s%^D?hFe~dKwpqlh8y4Zi6+}kQPEYD30;s0=2s*?>%UywyM}~UDMLiQS!knJ7CK-!79391-mrkE2W08NJ)`}u z=O8stGtFI}>NccaflznBR(qgKB;+WY74>WV19EPW{QzMaFMAuh_t~|bzY6YmLmYdV zO(f^0$@!!bvjXiF=jXGma61Z~5u)BewT}uf?3O|X6=FUC(J`o)#4Pq(q5XR8U)<7j z4?=5rE^yp6U9mMhAmB6f?dZM!;_f!{ohNS6pBVz2-GOuyXLEOlU4PfKX8cK(mp@@| z$V*IP+M?=hn+%9ykMnL*;j^e3sPOqzH*i0aa$X-O9b^5N3ugY%>E448xl@+~L6_!y|~iuuL8g&_FWZ-~ZfO zTX#G5eBN|_>fCeBJ^S)M|Ly$GVpmr_#^3zjC%jAlDIWW;dTIVk@$v+}%6L2$^I~3n zE_T*E<7eaco;aJZXYy>yp6Rpca(XS}C2qvyikY12n9rWg&gahN>^(J?pYJ@|Y5w?J z`fPzX>A9}??z7z%mYGY>_nhqszxST)wOsqo_Sv)lY`;AR&JNgf@a&*Hht3Y!bNKAA 
zJx9)t*mLykD9?_$E%Re%$KtUkV_w$Fy%O_s*Ai#9@|O2HdFw21y_P&X&U?Y@;=PNn z+j#5tdU)&MZ98whULS9LyzStv-y7gCwl$g|hm>Fwg#=k50P@a*^Q@b2U};N3;9?G1A0 z7ybF83swKB&lHQ@O3Kl+n@C!KuL*i9El16qNZ+z(3H4?Q@$yZ~OOkiAfMdB0kC2A?$di3YcPznrPiToL#&bOsnu*P>vHl&uLC5AJH~gWk+=1_ z#$YR@Z*7jD_qB0voaeS1)TFu{U+h}^NbLRj)tr6ZK`zBwPG6V5U?DrbZ301r?;`i@ zJa-em$J=pTBYX$`JHj$@UQz8MhQ=srOTLzGYK!#%F}Bq5Oz5a3owd&9dRkd|Z`Vg- z=i@frv)&!oldFYVVSP#DW8=NrRqJZYV@s{eyOXbXk+!?m?P}4~S6$v-n?-ke6V{fy zsP$c8|LnE?p(XYFdulyx`S;d(!m@j7z4mRQp8LL9FRi%S+jl*&+Gp+U@$L@m(qHRe zSC?MZrPeolk9UvNe^hUeY2V7>Og$=w%k=jU8 z?HY4!gr0vJwR=0OX0$fya-ap_inQ@joJYKOP_HeuE%g`=yERLUx7J1g$$~m>Bj%&m z)33!}jo0Hfa@s|_cYZYX(fIj<_gKBHW3@3Cw?0D@?{Vw>C+e}b)>_spW%brFlm65mr-s}ic$7dGGLGG!UN_BDCFU{Z19V+?7!xu_^_4dfKCBHN` zSDq_Ay-+PLRA#DIi_e0Etn{6#mZmP2y<$D?%C=477WbaIG(WLYI5kzOg34I>pmTa| z@my){@YSigau8d|oi3d_JyV@4KfaQFwlrNX2I7eu6qVJXcmu9@`g&^aX=Vr?1gUp5U%=CpS2^D-Z?vIl+$jq0$nM?Dxegrh& z)@_i8n~5M*neoa&rnE41VbQYWzX(dBI#3m5e(SwBhO696_(l15 z;N0$e^vL3+b93c;4_&HO7Z>im|1rOt@F0Jzbb0ZTPajmkgC0;&x>7Aw%R$G~;{1GR z!6O^1^gd-XaHLc%6|0NIb7kx2YT0|_cBc4v@!oqMn<7HXpHQL!eqR!1^=fPlr0511 z?9IfViL0dElY@4;)#Xx-?&2s8$Zp zzUsqof6IZn(zyo?96fyW;rrh9z*{FPGxL|`3{9R~@)u_*b8?Ak`QYT`2M$axUAi~? zzUcE7z8@%+{h89-<(Ue|^?~V2GgEV=N~K)czjQUoJ~~&LKj)Pm`zLC(FvomsWltUN z+&?)vvoKSgoZSED+~QPeuJRZjmw--I-}{fKEtwaK&zJm46sEzwrG8|x6nWTy26@*< z^NU`Yp2#fvGqm$|=8Q|)9JBjsWy)Win~TOXZlwhAtKrBphnQct)85qs-j#KOs8-k@ zX7OkcZ?L*=#!pgcZt6mL>f({BN3Pfy_=i6N`)q63+V zHfyaew=_X+cb~qxR4y)_FFxs)mM%Q9t|rlR@*g9%Ceb|`O(G!7(uHDGSv60j{r(fA z+O(cOtB+6O+_GU%eO3wwJ+FA`(m74PM_3#h2RBL&+UMQT$4c?uiA3Y`L@G$1_ZR0U zgG6=Fe@3Ysj3BT1nd-WBi3U**SF|A2!>`8I;x}SiD*hRQAo21g&kJi3B&S&>>Sc!S z2#=tv;_7&64tV3Y4ZyDx!v>%mAO(pyRi0Z{foO>R7m4hOj0XdLd4BP7IV|;snFVk0 zrA@oxs!V#?FP9fqvNnb%$~&p#0m5&oZEvjLu1dLjX-S)aV9lmQ{)ec@iL}2> z{?_5Kc@<7@{~o@tbU(8sdbq;Q19)9vKn<+voBuq{tqD2z;_yi@(=W<-4hPU`G46pP zWD#0P^%z0w+#>r!>a>5UOo56RWXe~T0G+poPcAkli7U5wrd++f_srs@;)T-Xa&cj? 
zT4aBqLYl4?R9d)N42QHbaU@82bVJZV$RSAiAiuO&saWq;f}~%r1lcHV(5aZNloy*{ z&XC7d?@EwootNEr_D8Lg@Flr$Y2KfV*q4Qx~+k(RIu#LTtygkW6I}euNifW)>=NCW7SAlShKWmCG~bmnJnItBXO# zZDB$7$kV3|9y)&XNYE2z5N&1b+>LE5$ewub@zYN~dz^Yty?F4-Ohxl5=yYl1e~~Po zIeq$>6G85jK}z@~LH7Bh&z*ky@WJCjk_k!DGaDs66J#8#BuHOX9fJ&b48S``0p;!s zIu8p%p0h2NECk?e=%UiyOZ-%j26>@B@=rbU+|y^DIeD65<1Ys3=bnD@DLRt@Jgv>X z&yANh2feE|qs8wQXwa=O&|b#SX%kOAJO5IUJ-JvdGkGoso#9*o8>$3_OG`|ca6(M% zalj;J^TNKTmrAr=3FnjC3%82>8mu=%Q51l^!dZY6vQ_NNdhjq7hPWyNJDW zjsABbL+9Sp7W5QbeUN=InmCyk4TB4^XQGh)GnVw&qTd*|9=F*wMk{m1C8OFJ3vE|H zPWyxF{QO*FRC><6r~abC{ePimXXj2i02_49(IwGK_qovL;BL@;Zpy%G^xdtnaIO~4 zS!v;0Pt!OGDGdf^78c5WBh>ZCSR)`D?nX0*s`E=vD^XZnIR3*M2HziS-8kHO#(~%i zG}ZR)ARD16Uqevkk8m={OgtaYwEY*7g?K*E9nYr5RT<&Pr7tn{zP)274$jQ-MiGW>)kRDKs{C0~O% zG#jrbW|LlAWZMg|g;X2@C>}c#<6CMiE$(y;HXm;tYgsQzTw@*ao^O7yb`nzHm)R0q z>$>4sPTi}CTEa`go_oyFcPn)*VXUY7remwg4O4ZlreNx&FSBRWl50J+l!(FQch{1} z#7bDn%{iT^#nyW3WLxIsMBINnt%Goxs?NY2Rv-97IBbi01nI-ajz9Bc&~fy|)6XA# z?$kt|e?w9JnVfG@gv`ljo;z{yxc{9B(YE4$SkCv_HxJYvg6rc7xv7xU92X>MLjsTbfFX^@wz$Oz&-yFalXW_qCGyr z6v)IA8umnQd^DA1$P*t-rA8U}JiaW`hzT&*m;eqCX+vt60z#BD4QvYhTRe1TY%MOb zhCx=>Cf8D4qMDw~7)L5LoA8nslm4WJukOq8mfNDV9b1?N8sN%1d20`c_q$kzfZDi2r~+(lRQ+#IpoTQWUHv~*fDM~DNA>zioL**K zr5BHu>R|FImiK6UI8jKi47HDv?cgoLlpoGXZAZbU+q+Fy*iPyg<`!Lbbx^d1%oVTl98o>hF#mYeQ zXtRB5_nuPO(VzTDeyYot#&BF+8cVWK8%Rc*;`EYrx~ZS#CDp5&O=Ep8+4kPjORALy&3fr9Z>rfWz`|pWc|d3{b3F~K&xIl*^KxR$h3^vE zMDGa`1MT(-tMP?|*R=|;VaJi!=C>6akkiuya$;*8dP95@zmc^3kgO5+8H?+__2bWu zAXVg-SFDBZO}=)7dE_jQ<)&DJ_;Z`)(fl>aBX4oG`170Q(fl>aqto*Esir)J z@po>TNAuSxkEn-6YH^p&faIw11eA^zAg!?gAG4ymo2ehD3FL z)iOp=IRhbRj^4+r>g_J}X0U^6s>4=(ZnP4!Hd-Hx`*&H{(RdNQX&d_~Kl;x2%4LXLUT@4AU!z=X%*f?qO}Pw{%j=C=`=7_p-+y`BwlbFegS-VDvzIE>ne%Y#DiBMs74Hvn5VB=| zPHgHdT(Tv@Q09pY;r}g4lb>0L!bNm@LGsMeQ$gb7GqzoZ<$w`(h;8vmG4E>wT`35G za5(%U#UP$v=`iy1k>W~+ktcY%44_AGRx+Z~iJJ|v`uis0K^hXdd?n}zgI1E{&o=Ti zRpeO z9X+w8JwLgY15eM_l3JNJOq4=sd@T#M4K{9w8<=_xoPQ*C&0*JDDYb!0rB6ECi4KbS zfcV(0ukZ)%G^c9T)IUtg1xC5xb^;)u|v 
z7huoB#&Zak1|r<`8O}@RPQts1SOP{wLgt4F!5Zc(?E`t9^xr`Cl|ejNh7f!>-knH+ z4UENiX{T$#iW^MGDQ#}ipD^P35YY3nTD+FHnDT8)@nW^ahe4Vc43i7lw}g)}%_-tB zG%5dRHLg$_hMlYENQi)0Yr>}6h9b6>sAeE>K)D#04jYP8eJHju6xj#?!Jn&72E|UU z<%PzHBvLt5Y7myJ3GhE(>e=Q}Czt?qgByDv4J3z4PjtDl?^1w`u1$dY5c}vQg?O`J zNH$yQ)59=gMB{B-$qh-J@SOu14A3 z5*d5nXxtmwv>-TIdKrF(;Cu!T{R^6{{uW8hTPi|!uCs+s+V|Ns-!g{$gvicXhu5L; zsAku4%E4$Uk=FUyw2&e%AEHJ1wN9@Sc_p)DEnP1qjw)4x{}l2{ZNy5`Wir%6J?DfF zca*FOUJvrhBxJFrHxSXq$aP2aWuo__FSzetlCwq5WjT{_mgUfoG5;kwA}su?a=u=U z@L<0xrzYoJI1@dNfz;t4eDW8sIuhxd_4!-nd`!-_$@!OZ1O)t_mD89P-=WtJ$!S{Y z{vRmd59Rz9Ifkg3WAu~e%_lY9+wtU~nl@&6WVE`sH9+hJ{Qj2V)-sn2$cFp>!Q(L$ zUV=_%(14(0>R9mGBxLJ0e4?bBFIo4oXo1B$NyWljO)Rr`!xRzim8i()Qpo30JQAf~ z7zND_GiWGKY1$YNN%&RbX4X1vT4dpph_D93u4ZRDrGEx44WVp&n9N#1u`GU<(;hT@ zOoVg~zg_?eEt*bTORb`5c*=>h^UW~bcK3(G!=DG25dU5RTgiNi$drrcXUcP41raHo zTP|Kzw9?h0?euP%6zDyFx12q4w&K7=v0>I0*4UyQ0i9(7Ui1-q8ZzP!GPX@q=^K^e zx*VHNno2>z3Oi9+fPwG3o#KOpN8kgA1(JlcW$Wk3F36sK>gbct9ennw_Q|wFvY*yu zx)VNo7-65+Py4#|@zj4`8EH(a7Fo`^=_CkZ)5*i5P8Tmlup>m6 zLk8MvyqaN4)?~?EbK7-+X{XISD}sDB$CTJrOROfBh3;oXj6<*++78T!!vX`=7^0Xu z86s2Jzse7gWi=#Fuba*yIwFZRmZCvtmZIJT(b2U51_Q}iaw23+ zcSbug3ZE>txRs3iL39723my%gTwT>mh@fKY*l)8`D<~nn>}5`QICitO9fuA zE>&}_(2jK9DXr7%4eM2?rKMUbJwmj*q%+xNvAU^Ss5z*xwH|MXXRqc8>HA1E%zHnY zgvr$mI*Dp;cc=-3_8#Bk=o0pOW2TKWNZi4CzEPj_5t{I}nCx9t__C8`!GX~@-; z9Qs2ovEj85(?A>L>nI?#bF~9s*V-0JMsv}vi{)RlnY{g^gC#o|JO6Dsx3q}-ci^n_ z9lTtcLDONbTnvStH&adwD>To)jU%b$oaf)K_d6MR7;^W>zYk~Sz566FdPEXQZ>F*| z2lT4Xr3$JVrK*!uhL~mL9tY$E3(wD>OBzAMQY6PD7}`vG;>tVr9lqrI&}Va3(fhy& zOMnntS4au)DY^^j>Pa!c(CU?pDTGw~JIL;q_BgjaKa3CItl*Tj!nNnEysq>#tsH5p z_)jU)LvqgHK=F*rUzD>0XQlH!m(V?NGLdw6I&gpJ$rf)V)b%)&X;J~AdvsfDx`s0 z7zQBo6EYY>|(E8H4agA~~QV^?1bwk`ak)skk>3L`P)*p(2Bo6A2k0VMF89LaR`m2>K zn|6E)uoR#azx3b+OQn+znX;cuPrAXZc$x^83{W|+a@Kp{kUyX!ZOmfNkM-a zX{4>qg=E7JQKBv+7l!1ZNu#h%gRax@YOfagGH{ZPG* z#5Hp3-L^?~)u3ttZIU#Bz0MkUN@1v#MiX73Laa_4J$>84Z_Kc|F2m~5 z20+2+KF3fxj9!dzW7=$vplx-FWm++dn%_pp*`l)PuI=A{S%AHfNdGsvr6(d&Rx zd`P|z%dzEZfIOODVue~TOgHRjmz*iYD(`Bm;C 
zv77!v7Wz#JI~?DY$V*nsav8V(UC4AB*xbt2O&3Sy{uV||n%q11CHYm<4eVK9b6^Yc z*QC`1IS}Fz?0=5^?RA8BLW;OX>xDZuF~Ak_q*e@(6)7+Wonvr&Gv^0%c*J&99Ug&Y zN<3i)iq|jd&xUxQX^&j#Iw~x>c%odXNUbGpul^nCu`g5I##U&A9y^Kjmi)bP?ouS9 z0r%ny63@K*W@4r1^vry@Qao9HsrcL?ijw{!QK$H|{3UqPX3@~Cf~Q#D5%7;H@5X3p zPuN2u)23?H9W-Pe8esiqKvhj??LieI1wwMwUOv&`=GDC_!DgrJ1&ysipglrQGf=2( z!9k{K{Ssv6!sj{HPgcXnl#@-WA$;4Yrs1FxqB|kHxCg!QW>I=n?UuPByuUtIzK)mI zn=8x&@e$I|Tmi=f%M52inwbD`XEQ-Ztg_6JXvT2Bs6G?An`c6&bUTD^oSs?oA5ohO z4XO{Y{|<#cit`4B(|@NTt9#IhH0@O7YTac$r2)N{o^k`ayRn=?Mb`D>=}Xk)KY_zi zH>v7IBWQyc%KHuy7c~_%i^7BKxzHC@L)lqf4pNuB8Z{B1Zq&jN>swc^lM{o|1X)1u zYjQHkPojP0U7C}>Fgdw=sWj)l_yW(ILF1!d_rY>$rO^2_tRbx)#%X3UDYsbVoJtj0qyOOhz zDU#@+#maN&#ccp_;<<8kd2G6_BOS@;BEKvgl!8CXz7U8_OQ)6LgKHHzm7$<<{z(hhh^))lfD(} zx$mrLly>Q0(5hPAPu!hhUHUk1)Lxe!)rF%`yS-gTMDC$3d&0Whp}O$xPKC|hWO*J9zcx@mY&AaQPse@1LB@Ghq9`D|8PEJt6iLej$sSott zoU?81`%gOd>?PKu;cV=JMfpX)IdYCA9O86C^7YEuDM!%Hzgy0|awIZzJAhGyZ_#U~ z98r|M2uLT+xkJ83<-8MT9as@$u$}%fMR{D#6LJiK9j^Q8o5I%sh#lp(FQlU*(a!6f z6L6HPhc)j3&1uw-a|aVa6ZRr++Eu!*X7hb4`vgZHFHJ zkbFNZ=Nsf0;w?F;Q|kDTeBUO==-=;<&q!aRguhd-B7)sM_TBQ;DOmrX=v7;;{|PzY zE9aAPzE94lGx>f{&X36XQ90TYCx#tGoS83GBwiJA9~5-ZAesXv z+R-@z_Fw;3lUe52x`44h_S59LH zY&L$Dum|b=$_k!-#6ies(fD-#PFgF`AOfQeG&-egDPsH0YIAnCX>$(7S4KBm2PfH) zpN>=UcgcCT9M##i!y03bHAXx4{c4Pm%xtWXaCwd_ z)X@1jwg$@R(jZ96l-b8`BsrySA}KG;w_NzGvz)VU`Es_L_X;0RaAv)W^XlDayUK;| zzk98x+@+K5HkFRi|8aTCTUO`|aO9!T@be|NQuY@oJy_vE-n@3QZXyX_X5)g+j+~px z&)<7^{0pCZcQwcG1h-H3@B_Binbvy_^7bUbT2>5G;tdY_emnMwudzNIH2>ku_ zyj_jK!A}yY~RQA7?#4|DGO{3EF`uQc*A}nggN^uoy)Vkaj!=F{ac{$T^ zCUGVbR*LyoI#0g$`0+>XyK*0YL6^wMj7CDQ@sg)ql8N!(r z&iXMvU9fh`bH{i&HM3H_rKyZP2a6vC8eEPti2{S}t1XeHw97?*@@vT)=2y9ZBl!w4 zTNqfVQ6SBg4FlX{EsdM1Nt~9h!30962D=BO7NOf@wg!`G5*E)|=M5NSpt7h`t#wtq zXM4OjDv}IuwHKC7zkMChRxY(XdIrN5n4k~@JmMrNnJd=JAam6{b5k74aNevRc8kA2 zO4?pkUp1O}rS1J{%e(C<9Q81Q7e&e-H8o#aa-jQJ<=3hAkZ~R)Q3ui?53FTWg4{{1 zu{nA@NG;%<=ybf8D#;v)eM!E{a$b^iMUE=zUzM{W=j-LPL9Nfy0slx)YYb1OGlMj~ z^*=Ag`ozje>q4(%3dXx?Az0F6M22oWpT)yS#H#Vx1WdkMIJO8{!w^)+7%h|8hap>S 
z%#EQLhTLi6Z_G1|n^W%0_|{_Uw*owblZ!{3#Gt7Y{)*B=K7)(h=pTJ*>x&@&G>0UQ zYl>HXN?p=VoB|!v_~qlNudHlo>xxFI7PhU>5dCak3&B%ANJ~L7EXf)#qd}o(jrl1J zf>W_7QYQ4OA?~gtq*lm80=0OCoXr@n%BluXFvfSyo2n-WNAUli)Z zK>I%V0o7H2%_`=9P_-M;f$WlhsywZ;v1QMSrEWoY9fVuO4N3()4PfpoFA4bj(SQE0^W%jc8-yv6xBO|ypWs)~?h)W8MSagFLrlO5!a% z$7KqF=QfNtyq)KE8ED|S!+X?wC(j}V79Quh(|f`@$a9y>An@Gn9r2Fx+~b|W+`&_n ze1}XD5OOCW@Ai%pa+iz%5VBWB19(n&@A01FxsQYYr+MD(z1Mr5=RMvFEReItrEY;h zn!uV8w({;_K#;_1N0G}w;37>nsH5El?kz5&5?qWr;dX$IB|o7U2t{ynGulz|=60G} zFt^KGi7HYi!Z>Jl4B!~@>@$gU_PX}0{)O!pN7@3X+Tvg%C5%(p>g3q$q6kP-BNIMJ zz_G>RjFAuQ7$f~<&4F-Fc7OV)*^p^rjT1I*akqNI#uVzAMQ2WOZrtt2x07OJSqxdS zXyoQ%sjQ;Kl1yCvu~96gksW!}0u;JNa7 z4lrN9^vBYrs#&|Jtc+k61nt*iv?mo$l&%~&F|)9OSq%Gdh@+D5?pHdGdWh&9nh>PU zE-ubbByROl-YqeTZ+#sOHaI%4Vpb8lmh_fpK)$8w;D0*~k+os^i};b3oKVn#U_eNn z;dk0=tqv9YhOGlx5;;{ZF9jVh&CDGypRf8KB!eK&8QSpBwIk#6C^Vpwy&z&rY=|Ir zWo~9Z=)a=#!^)uY!pxjmH$mAPl=k2i&c}8vQ6Bxl0p1o$4t*krPU)0W6EozxGVU?8 zb++>Ag(r2I+e!Vuk5B$j>ATMRV-k)nSzp~5Z4Np=74*+h{S%9o>U*X4ZW%aE$p5Bf zomFQtP6>?X5e!yK7t2SmdvO_J30goa5O^AQ%iQ9Z)3{-F|K8m!~`N!j9 ziDBc(i1#A3hRuG3q}0Ta@g{U4R=WB+!ep}|)Df%`5-qs1c5k4F6M9G<22(eb>B3S* zi1Ehs!#L=aG~t6Zz?^_Ock(VpNAQ5NL#q|CGuU8V7@2 za^boTT2mQ_K8TTJSBdn+8fA2%wlBrdI4pbenkoNvk_ zD9km#WzV0wbWWyJf)1Ggz@!Tvy`zS8NTnT?V}0>Mv|xz7Fqx%OO_HEoXX2f}Tz+X^ zxG?Eptc<^2kF_k%H05Y{s=Ug_C<>iw&}B#7f?9xHaNvQ(B7*jYkd2Hf(Pm6@D5DcF z;hC3=7&FAqt>qme0?*Sia4_}Uvw68GD*B07qJ%eF0Q#p<55VmSM##z9hC*4p@ z$GPRR)+)q{kD6B0WY6+TlsXn6aJJ<8LalLQ5(?i~xnEl23uW{fL59$yoGT*%keVdZ zs8}jib%^brB4##^DPHGF;Vah$_;-tL(30C?sB5 z={a@!;B%*o?=0>*{oH$x?#i#^EN=y@BzW!d8jHjNdm4z$i|qktT!76?r@m|fSshoC z4}HYXXt9bZ5_H$0jjbjBm-NL}^~d$OtNz&*m8mA);)8F?U0?-UYtlwpatZ;aVP%`N z*|&Li5uK_sOuI$w0|bdhaHsOZe92(FA`^%FfB@m`j8w3a7t{XJ5bT+Y$Q#Mm zy(emEy?ZHGgsJn1VV#j^^6U7MXa&cor^m3Aapj$NcxVaZ!asYP`n!`UjP9y$46%Pt zh#M$L!knh+HGD)q!Xt=ZF)?P4oZ{R<(qDWj=wP6DGWvK3Z9mx-m^lxmA5)7M@dM+o zEH0eT$PJR;IAb*kduiO5ti=SDOcEl=42+tJCblE&c<~s*aVLTB@mC#vY9=i@M#ky{#?%Sp5C)E7aDgP1FR{75KA1 
z!*g&=G`Lhi0k`3HHMl08i~2lvLp&sLF!Zxnk1BO~Hc|p5)uebAgm_ZL{R8~sQjAQB z3*mZhoSxezyi$jZ-~#Aum^cPa{mNU~6l5DI+!%^W(FohYTkHnQY2g)e!!k3u-Aiu8 zdMq2I*Uc?%X>QJT$6F#)+(An%Z)dc`uwhBHjAybk z@ZN=s3yUu;6t&HneZZahi97uND9BUaV*DEdP5oEbk?OudC<9I(R+j%+@p8bOMIREr z;&g$vi?RF7EO_sRsxHkPU!217l_NuuGr2$q=GIi~$K`xdj@aP-_saPRoPYy^b0x6_ zI@uZ~L#a6S)wG>`M1_P_x?}<5vr15C23Jli;){ycCDSYSQ?@g>Xd9h{JrIStC=a&2 zKGgd9milY%=#{Cm@%&*V2{QY?r{a@XmkM%MMP!>*62pScF)aAg#F-TA^+=IHUm(kW z0aH8~ps@DMEIVYk?V*W;c1fsJVzF03w+<(WYx}6MX6>}1Uyty6xaF5kj9W?!`+UBQ zwv8A7Q`$(QKSA$01iz0OvgzqKIJ5FHd}km8@Ky?R##4S5;mFSLr~NxnqqmK9)}2|- z!o9=@2iqj));NPUj#7IDB-mtUq(fVEON{)UAQlub9Bz!{!|jIG$TM@J!*Trhnsr|} zua>HZK^5DX*EHswP#C0)xS`ikZI-@DZKj%P!v8>yvkF5^1o$ZWGy)K}r(r zpv&3XJ#E@6pkc=_ZCY*|fxn$OU}kuoN>8V)9k-rdVGJC)*4@fsaymW6JsZ;KMV5c- zMuMHPjApT9S^np35twk;t+L8Q&dtx?*O#>M-W&>%w#5y@2PW6MeF<(uTZm5J+jPRD zF@e8GtTj#Gd#G?K(Vfze- zn`Wpa$IRcB8OkiN8QNGOi|giOYn{$-Iw!v!?#!F5Q)kmGU01(}&XfLM(VzbRC+Dx_ z7;cIu=KqaewNJF*b_oI)Y_M9+c=h7wocQ}wk{ADj#4^(022}uuIkEq~AimHDayInx zkLnbY&aOT7pUKNCu=d7QTDc*b4s}4>%nONYlOtiil)J+riMauRwc;J0% z%W7YC{jJOH#HBg6!8rXn+ZJz=Ea=f9)Yp~;*W0|{^tE}>`G2qSPSbt040fhM3aF)Ru+r*X+p^Lm1<{Zn z=2v-zq8$DRmI(u5jWO0-t|mlad5JN00w8&Yca*%1OzME)k<~kEC0FHS3J1|Mx({Mi z3=CAoWYh=fZ6f6=cY4%1gvS%RvkH5~%K`H{2Ekxg3&=O~pNOw^RXeL$DKwA5`~X){ z&brT|z@G6cOd6D!VYYM`&MRMc-Pa=@W%Sm#W(J1H*3}-1+gI)K`nh|dcNCIPEik4A z#-74(N*c5Z};RP!M2-jCDjK+_00zc$$nD@qbJ%dH%cZWg>@9m88Lcxh~Mzaof;oES8JKASlzH$yK^fysL&1> zd>mLEsST}hS4?fBmKlsFr(r@mYa@=1`dEB<_f?XHMfHnIDp z0q7Y65Dfq=zFClZ&O3DJjecV7uqO2vVT=1IG zbgJzmFO>YHBQnD|U53GY2$9@H3~?_6si(cU@=B*r?&4FWm6GpaLm1`2IY<0_yIL;d z=XU&&U|5+vU8!6up8|~6^SD*yd$6_s@yJ#5n`Wk-_UbXPWR4an_XW($S^>09kSil; zbKbN;e-l#QP8Ba#iny>VZ~Z8?j&7zlQuN}06xO1##*aXDg~P)%$=e?VOeCf!-uJDa z`K!Hu@%ZPz`yc7=XC6oADv70h|G&}|98fo9x?S}8#9p_X3GH*lo3yU{kIH#f&Y$AQ zLVk79DG@p(GYzWiB>e}aMsloIdPBq zq_AlHTXC(H;$YYy(`CdGB(*WUMi&h;PXt7r>HnnSr#!#((!^K;5#DG|4h9fb%U+sq zil)b~`pP(qt9DpiHJc-Pd(N0`8v08pLUOPb``}j>&z}$S$E`)ujSD&tNv&-8lICg< 
zp9)4iH>W0FVg^nk>|dO_Wa_M0C7qlvUGe|GY6>gweoULwT~3P`Q_E5Sd|rBFm^inW zXBR8(ysHt<<644;7?`F7{^#WNH^Lr1KzKe4sR(Ni3<8`S906i73tKh|PT}_Hb=}ruS)fX^&oFbP*d=E50e?0Z&3|ec0 zkgQlJRaykC8{i{#@X~^h2*DKgPqkx}57Gb_xCm`-2x9=>508`~$eA^IQk*+>*S7)x zJ8Bt&t=U?~T86zW>)`paIABTa%_g8#-^valVGQqvAvju9SaJt&y<>U*!q?kZmp2#= zdH;Ade=W8OKJps-2qg@H*~s=1N7~+(SnXv0%xV7+BxmnV*7CxOUW+Y1Ma^^68Z8}H zYm8Yto^kpmQv6ht!Dx0KCA}VCsj& z4a*=cmh%TP#%4u|qmu;=pP-$g5xZ2|fCZg`+TqfWZ%`xI`tPEsXmR zn215Aubgi6pEhBRSo$(d2T|u?WRTuWK`Ouqgjh?5C^nWOE%!&4-y&9ofha(W;?%*^ zTb8%fxX?wviOk7OVL=;VA1%lv3f1jBrnUc#^q9kD?`v86Z9M0VR=)!;2IGWjPUQW6 z%EK|TZFx2L1utvx#rqC&SDvmo23wH7Sm$v&v!RY598gux5}CH#NsLiC9~X4BS6{f> zYZ`Eg#kD-PvkdmJED})kFmc&o+f*Ui8r32q6(klswFw|-?1@V%nm8*(10-4rA3K3? zXHzv!c;8Hh78s6Xx$5aP}Nt4 zTSlYaX)VJ}LNg|BRrYu>bA+lmb60aQl-!zhmxlDyAZ8bvx&&6(zjKU+S`x+-^AEB& zu`JyNzF;1+CG|C!8jhhnfkko+cVqNfvS(>W6MN`owbDc#28%TrLT-PMTk^}7-JNO( zKQLeRv_$b~8}0T1{$tX8k9tSvHv|P!AX$QXLcfafm8~s()G9S;8C#7}9oX$9SB))Q zlmZ7rlWSFqEFD!sS+EI#zi>hCg_p^Zd<-iKQwhI^-?o6Iq-$AQs)?Gc@q;}%i~?*5 z0k__j-?cdOF6ozdE)TTj(Yu21eIAC z!HVq>Wodh@xO<$I(nN4j!53DJDPb0V2N`g?+Puut&L70=Q^pR3pja_=0UH$^m(h>q zNR|JaP2*Fx>Wj4xNt16JPGCNJlMJ>*d8`u zS%6CY?aH>6NoHh(t7%32%9d#+reJ-V{T(mSGz*6vx|hZ< ztZ@->kY&2^Lm~Fg6PEFoO(#doITr0Lb>!j~cwsnVYg&_$ol3J?NvK`T!qbooPz#-A z;XV+7XLy644?-a{m{4b&qYHV2ZiOpPAsbOzrx-`*toX`NsNLkjJBS>n>dgucZ!(?L zh$FSPD1IYOXi#fCT#y+bf4iI=a*A>U7qCd`R=om@qGO*}BVIHbfB*(YZ~d`rk?OhZQlUgigW_$PSz1}X6yj*MLwTCylwKEt$e8k< z40*d4-*5V^#Tv!sZkLd|i#hwIaB-m5$LU(@uJc193(`W4nuesOX{o*b;Jkr>2#kP$G=63A9bQ=w4PH!w4jaHpijYgBOsD*M7tE0Y{{xvUjg?=LX1t0AoCJ+@I z2)ZQ|-jI>tjtg}dTC8m4Fgsu~bU3l41eOdS2Z+SdV5`jNjwFT%x<2Aewo-3_jK)z&A{B_Fm91fEbqO*Nc zYOo6G?IQh?Po`k*ic%9bB62g4$if;c`Bs~Y==!J| z*W#QoXD?u9L1PELX&&rfZww7@h1D+Rb2q;pe!cwqNFDClXdiL{!ge1EG5KHk4Y~nD z6dL1k=ZjJ~86j_A1~zmjjPEOunriu`yuZfsUiZz>B;-Bse57r}Y1A$0e-{^Lh=QRe zaCYq()niLc5>#W1^dgh(hIJB`3{|VG*29}gRBi2j)C`(h38I?->@HV;yXDF=szF<> zz2uXr=i0|xXY{r#y+U2o)ys*}E@xX>A#2^WKC6Si_P{~T0^YMaAACJY%@hNPxng)7 
zNQ*g*ij-Gz2f+(+7^uqD1~~(iUd4db+7M?shZ)~t*}s?_xr{{9*ETe|CJob(8gn2l zYjkalgAK!Cya`%)o%r<>v&Sn=&gXSX6#9gzt)&JB$swYB^hLCIjHW9mjT zrZSKSbJ~clEg>7(eA_;|gEPe;Uda~7*(2IEikvo!ls1c$*6dOfvU_b$i)1!Cd#7rQ zWcCdCbaH+*i@C5y3WZJZzON}PZmlq;A|OM$w4G~rBO@N$8uR*Ll5wup`8m+xw$*!^ z&bMNl(_rrBSez!{b3j6vr+Q|+1F=fXpv$!XmloZv-CR=Wtk%5D zx9J@p{=$1cz5R=iPk;U34?p?Yl~*1QvItI*mzCx?zLcC|)9Q5hWSgjKr2}3hH>?*M z*e^TwORA4yu!gt(HH&=WfZK_`Q5EQRY$!C!Ps`|tX^b{$k^YgA8!c_@X``h9Rk8zX zd$@`Bn&ft74a7U*C=e=i|`TYPDfTVN60VU>rV zC7q0@_@7V>q(!CI#t)^jtwhcmun)LW2TLLzI!dWf=IXC0OAqx9$!mHComjB+H#T9|F$ zTw9mYrcLp`hy1Kc{y&v(P0pc~B7=c?Ubd)hZC;@Y{`V5O?YzwfQP**1>p0aBdIG^F zxRStKc)m>=li^exI!K`G*nD7PiLglZmvcAoWLq=B9>0t=7~zNiQ5&4byWJ}}EEJdR ztR9PPLi}HNNVciJi(%ttDh=E6Lxz{-4Jj6FWM_P8$#=$ZqPE%gsbvr~@bmg)(h4IGWh9{W(iLH({gNK4x5qS$$#j2OIw~$~7r8jCdV*Q9{HN>4$ z{gJv!`K1>{yB<;#%sDEb4EcRRbrzm2Y&{R4ZR=%e(LN~gffQ+4dRc2&)iw#sIXuiq zTgg;6A!RwYqPeUr<{xOh11=c>r%Y$$;{XJu);PcsDn~*k=o`>lkUIDhCSF+Hv%FQZ zJ%duTvvIz|=DW>u;Q}Geh*2}Pa;~?}p;`0n%oR*sNi#^YcC?Yuv~h|qt-TQxFg(FA zHUn@ke!sd!$|GWJN*7%RiW6T!YSzluY);bx<)pYEez)df<-3`YI^6%N?rj3rTu;0P z&-&HmWU`tf3`?*E`;zdsLHJ&aYh#&-y_T?V^aC+7_&fMzX5lpRu7hkytW5osKI3@T z&nmxHIml*guC&E181noZ_;dE&1+?q716nV%gmFn2Xfq{Ac(R&W_}!MTSmb4;{DtMv zV=D4XE#Es<(+e+9AGIYj+hb$+%&KbKTq?(`{GY7$wuE-9_1ysKi*qicb*6lY**oSy z5B#e1+CXb;iDV2qe<$r6tbZG#RUJqIpvaeBTK}aUPpy|?4zXw@2e(*EY0BzH1$nrp zbK6PCvp;lxL5f#J3jSd&`C5GWH!Ws?n5cRZGsUd_Gv{ATEq`v|9h5@bt#`iCS|Yu( zt*v*4)H`dV^?25T`dhu85DcmrcV4h+M(l|^LbZY zq23ys-O6uV_O?W-;NJ!@!#A(XYTE_NoNIv0KZGALr@GzASiD}|<7Qp~a@+FS;Y|O- z0V%p%AgZK{v=d_0QW+VNYmHkkaez3SkR{i_dd@N>q*`v2GeW&EXHr?tonhP|3@ar- ztRPFzCtlSWNj|DJ?2?am?_MpC=N?zbTH$=6dPnun*}GKsx0t$jm`}##Ln>Q;;la5t zDi`h8^sxN)O<|HTZM+hf4^Uox1W+#Twi5PYzidP`wSMPnqp&|^*IKRK8}%oopV>D1 zXzZhjk0#G2&AwR{)#ViG-n|>d-A>H7#Z95KJdPQRBIK+rV6V@2r>TkUs>hNo>ZVx6 zTSYy16fouQlw3wJZGw-+Fb>}xVzV#ueVwQcxr9m{oy zt04x`lyE2Bu91(s?)??{459lI`HoXDcW0<6#J#LAL!&O~gBYCc_db2QqL`r+6X45Q zivNp>XGX0*Dc>qikay?aPLvleafk9H9m%)Ts2@;_?^7yS8~|}YBLD9x2^b32gELG3 
zy4yXG1Q;QhS1Q#QRT0MexRiK>gr);nhs<9Qa9pB<=pOn zk|3qZWi989Jah7B&~xroxq5o>$+^XIr8y1tudBF2s@SL14>p&7gs>lHN>#p_@LmXQ z6am28vC_baQ=KN2vhS$Qo;tJ%Mc+{{OX=@mQ7L{WG3A#wpVQ}Y{=^3pUoPm*97y>H z6IIH$k)NsjqSNAhvgOj97zZ62=XjFj&fS!(P^mcSH%gsWc7D}0*23s#n$>Ch%DZ?G z{=F7A&dr-_}TEk=cy#w0zX z9$wkr(jV*AdkZm=YwnB{*BSA!Gr)@IjD28EhM~i3PaCF|fMKiSyrQjkqCV4M9Yxo% zBd0N9Y^v5`-CK+Yh{3%VbY&EygU&(!qiUY~kZyQF&6&0crX zS|_Qkhwp0X;m!Aq7BWGUR|mf&zsi53W*`RfS9SWhK_gw>>x3Hl~7 zAr6@khfIivWJ0{1OnCpd5(fm*5eXI#o0?oCdnokFcEdkqAiAhWcNqA zCYM7hv2JYSL*DY>%$bY~nO~@0!O%VN+Zk$0KZNy8SS%ot!(5(`sE!a9X~d&y`)HUK zTT~Pdy2`!6H4xw^Uc(>OXo$16b{{1%>d2h`bBb@w7x^3_M@WdsZO~VS$kkg75t_F8 zZm|np93nS{{V^<$IRp$;4P+GbYXnq{DIUfSTZjFNv^Q$#*A|PV>sDAI&9Myq+WI=^ zmrO1?mBNist3nP`>a?r}&Rl+^fk}a{8Jh65N^B_8Uip^g{Fo;GJic|CbA$&O>|ly8 zG_82h;JP_Yu?|s4y1m;rvCpWO#^O1~>k}*}KPKnfDBgscZH+v|NhTXZPfn6mJ3ZZ{(>6vD{_8KPGejCZM_khWYiv>DL)pR)CeY> zwIqb6#_8ZdClsxY;KPOE0Ij$_PY_E7*=-M~ic70KB+k*{$1x=yzQYs3hMI9}A6M=v z#A06Jt?2n0ywl5HAs$+F4h>0|~D( z`y0-Fhd89zRBFR<+Nl&)=eoUyeBEAij8bf`Nwn-W3EOLQ&wCR~ig;YC<#r_5J(@)y zc$N#cW^;BwvD*dmKyKYnjAh8zw1Y_*2!r4C>x^_qd_iRE`Ggt%&I+oF$zm~KI|0F6 zPKwzV#_adHfv<|G-9vf8W@?E!5XKx_7jvjB=5QEuq#jcc9^GN7Lbc>M8phmGk16P` znC+#Hg)z6Ti`icKxa|bUl*9IDCs2E~M`gdNUPo8@)daUFC)j-x+@#D&EZ#Bi*UapD z{}IBHkgCQq^LAKFS#tUcV0(~p8LAEw#-8dG*@=~p%=GOH{TSyHLxu4AUJ0|Ks1zaN zLw>hk5*F^48`Ns~_ig{$wQm1v>{WX-!R?^S!QzYuhYExS}& zSQ&YGVTxn8m2xpO^jduJUdb}oZC7csGO}Gkc~;cFg8Ds5MvPZbZI&+Bh<-nU#B5(LA%i8|Jvd*7cOKcWw25hve=C=#eAk zH`hr2ca>I+qOQ%2iZWW?DE~hy$sv+BuyPFF`Fbmco-5CDTku!wWI=c9WbuMn)Ev9h zCGwJ&p`e*SxeJ-e=lFJ-!^9ci^7MbNUhoE0xIh(*4>IU%gKRM1XI_868-20K0H0Ym zz#H|&Y`rhmkH#CW!6u{e#%eHEuR;50Y*KzS8vcJ)5C6WL|MGf$y`TdAfEYpc*v!IA zg=yW3hZ(rQ1eZ$FV|dvrSl3-??bfhg0IkppOf$-P&skYKwJdeXKJLtP9X3u9*LS z$~mHH*tY$nRPt-+oXQgfISec^KC4?>H~NjTxyuNbw)aB(o_O8F*59@X;ij6QOp6dM zNn!dO=2zK|1K)xLDkLjB6zokbLDv2Yz{O#Sp?)OqcA%>P0Ed`KO_531ReQ}?z4&j!X4WfH-$W9tyj zdMgjLwo=CfrA&z+*V>qTG`1j}euOy^X_=-J{;=gH+_zmT)B!*37~W!Y`?shuJkNRy z{r`)AsF~5UX>VpgoR6rP(iL^gfkBQ(tZdrUD5I#U>&8>d+{RPOY~%5yz5QW(8Plq4 
zli2tga-EW;muS3UwA8;CYXwoaghb7K;JToP(!^uo?{6MQ$ZB)pj~HmjYI&7*`hP0t z3vvz-t!a1pL&CmQy`YZNDScgXEr#9Z{SXa#1T5E$c)cH9B+sUP;8rEt1nHpG(k3_l zBi0{;GXCB4h}s>-@ls}Y#@=lnAVh2LgK!@FIqitrZY^$ffVKUL8?^nignego+qIg< z-_RVWx1B8Nf12rH!Py-69R@&L#z~!9qXkWBG8%K*Zse6(E43H|R`!l5YKhckF4~Bz zUr`Qx2Rbl!ek~%i2*GNS;^~`N`4kx?tfv>v?|wR&Gb#B zBEF4+k(qzY8#555r+gN!kh+4;x^Y?F8LhmApiL{!neQ8H(skUF;Qy&N>#7p}lXNNE zQRT!rPotyq_&fRKHMd1)p?(okI?zf^{UQTn0)>Yb)r#M9^cLz^y$UBy@&@i?AHIIQ zp*TTK?gpA^5M_lKKXZ`Fs8!yO4$GKNX4KvN38FqnV1QtC(90P$o!5aJe568X?_!Da zRIDUId{J$dH*Anm_@`S0%LU1zP*17@irr0~P|dAh(3Z|nhrYF~H;e)dOX>|v5=E-C z+-B5X5|*X)sX^b41X zZ_+R0uhTC@^-I_*+uBOn{-#U9((1-@jS<+{RML*Nl8SG-Bvef{EoqlZqNnTx`0|sq zuEDdCqKSA`yWzL&3RN=p)biXbwWrR>zr(u|&dgnK@^yEWi@i6zJ4JCPjDNGwV%-ft zN>`&d#WJXV&+BkP0K85i!tv<^QE#S=3E~(7_vpJ6B0176HSDV^*)v6PJ{~FF#GbZ* zw=ocODA-vI`Ah1fYU!v@7?LYY-#9Y2JM01h^ep;K zW?anciqE1hJ$C`r z#IkAJ@MD`2)Q9OKa3uj37?djxQ$M7VY`Z#!FHXgssNp~ht`pT{ zy?y5ojkwvdg_jh~&Km24Leq_^->2+MR?+3U-B{NqtH|pz2!;e3X1<*>FnPsiDB9T^ zv)T1PlmEe>OX>@nZuV_M*~Z_h>Nc!DMooT3SjY#cLr7VpR^U_)q<%2Wn<%e*T!w9= z3Lv|sQW5CpDW6jzkRLi6%v{gWklt>*Z{*7-q+U>$khEGnXrELqB#f2;wRUK{dcz@x zhMpTL&-)vej-cQ9aI<`bv_jmGp5Vi30pEnU3;9M&%jra2GpJH{)4}VtE~F+Y(9~xz z0@fA|v6$l8g0$EKh%uwDpiK2tvrVbOa7)D*-q%U>+9s*`*ckXt_D5xjs8Lyj2_4t5 zuoTw}ATAD%RE2dCu@lDAbp@Rm`Al(ZOd}rcC7dH;{%_Uu4(m(aI^?AzUlL?*Ars;F=?FZypagOUDwK9?`kG4aH44q)` zlaxZ!mjBvP?$F`IFm|qfR37^i_1Jcp4((47U5!I<5Ahb&zqq#ZhBN*t^KmMH%PQ@N z**;>`YQDDY)7u# zCyrZVpRDQAKTsxbkz=YPKdBPUwnBIv^=*ARp-_W?c4&LhvK2&g3SUoi_1_h4_R}G@ z_RxYmW))T>r8*YeVXQDn=cF7Jc1V3;K=5w}%dn$Vo+rpj1*L5zz?TB(Nyn<$9Rwum zUwpT5NH!QU9FCGv7Y++y6Af;hX-`Eg$1Gs3jd0s=A8sMshNw!NaC;u|4HT<|aH|U8 z2NoGy%E&8Xq+3LZ8A60cVQQDPE&<@WG@eK@;2k6gOpW^0bAo>8aEIuezUq2AT}&s! zN?#GLHGRdbk9`$ErLVel&Y{6d+Z}!t5u~p@?yJgpE3#_ZA;PgRYZ~;X$3&~pW&*l! 
zJd?|FUyD~Uv)mvX-~XqpLhgHIz-U!z`^<-FM~Uswo9aSSxKkt}{+`ud%uE+%2Z3Cl z#>`M3d3K={+y_L)9HXAK9EU%8YFugyOH=;d1(~6CX?~^F6Mc8zTz^@OhhzrY)xA!7 zN>^B|6zM5}VV3J_Aw5O9*79gm;{vya`7%P^3`>vMk(mnjMuDG7EwSkqM`T9fYh3?Q z>k=yd2J_$=hz;}TeiM1X0@^T-o;Q&Pn)@5((c3~+e1slp%wOqJYyNKW2EDOxE{)at zrN3P#E4F#tAuD!3Rt(ez!q{eJnskaA>1Qj}Zsx8GS-U(fmXQ^Ez6x2f(s8DU@*o)g zy96&JUb7%+TN|mc((z&#K``6i?3-cg65+VZq$KGxNsd#t{!4)&!{h&7`Hb)o&EXWV zyACcb&0T$Cs?{wBey1)Ug({IswS#ho>K#(ybu|839gUx8LE{@2m;SFu;6L1oz;A6w z;2XNf->kZCr2k|1_ofCfzE^vQaCdi`z08X^pUztOU(i=WCbi+AHdNgIdxj^- z**0>98BzD5U+O|%Y=pijt<&@s@#g%0qG2_~NMj@#7Jo%yFDb*+6{W&OCwNm`qz1nJ z<5c_!ZB?^$ORvt|vKjTGjnr$`qdHt%xH+agWg3F*NEU9Oc#EdJ zw9akA&07!JHD*c+R<7Aj-5gdf4KV}@>{?B1b#Q7giHTY1CW(E{xj}?!#@r@mXw_1C zjMCtucqyA6lx2#1q>FrAjkg8SupqTvdo0d*4cVBf+85}J#D z8fBMS76xL1T=8lDG&8-!upQ!huff2SCX}u(K!G#q&s95XUBb!TS4{Ulk2+&(U4Fl% zE}|AxE6#&)Vx^0n=a*b+ufq)YWwybQMO`){}oIW7uwWvbu8(ka_n8OmXX!i9+faz0w?SWZ!kKNhcKqLRUm z-_wZ4PTN!Kuya=LLDe=gA%am0@^Ghv4l%Cw&NT>FBdzwaPuqNH_-g&(MFBlP|6T+X z1G*+aJ87*?Zz0D1fG%Fgq((P%PdBV$lx2yL= z*B}8Tp#h3u^Lp1=h{WIbX9P^6+c8B042N&qVPlLM6`(U1y5~!_-cV9Npd*dKb;hHb z2Kpp6tR=9Th##A{*MD05`7Sx{mh)M4{twCb%l0bg({jEjr&ajm|E0oYp4LC3)*II< zC*R*GOh)N^*+TQ5mt)6q{#Ic!4q=uL{7>q`59{?)^63z>|D$qrG{ygE`F=u%`e&FZq9?S8ZI@*^M>Qv?PZ4ssjW@fn{U#{FB+C?CxxT z_F(qz?7i8q$=;LAXS=f<*+O<}b__pn`OfT??7Or2+sc;=VK3nC%I+oYo%wV&m(LP5 znmv(yr0|rlGN$OwmOsjjJ;Be8ACzNfv4MUTzh6@QSMoSd2~RVtH{L3hQUQd>`fSF>oQIX?p}J#m z@Uy)Bs+OJrm?NomENNw^xv%TjyPrgLplxkzrw(p`G;%hdWk1T-&nSB_3bU~kTD`u) zrg&}nBh1Z#AIG!t9PrYw#LhAT+z1dmoAK=Sv9lf0z~wzFn@2oz5V}1)^H?(Kx z9?t?K%n;8m5qms!Ty+c29&gOs%CpxS_qOruLtk|V&wj7y?c_P&?ecc>9Q5w-BteOv z9rEt-_7XDeO?dlwj(B%7JMRth+_iohgF2^6=lm^dV!Ek;WB4_sYG&ojh=8TM0PPD8 za}B)e1}8GnDA&dHm{q}LLgF$7ap7Cy!iU7|Ksd`)ujc291^had4sX;;R|{)hTrA%W z$B!<$fgu*cQLMd&PUB?1U3{+->c^`CvxB_F5f$a}59#zYJ2Jwg$>D`v7O$IlU$-tE z7edwJb-&(M@)&mex%oytQ$`6>3{!63^@JTK86j*7zcB_WkDxCFcVqw)Rv8Tz@rI!) 
zO=wH#I5awV1-?PPyF5&&+V+iUVfmY2k#Ae->9%73c9gGB^tw7?)=IB`-Nsj@0#`1n zSGQC1?G|H)i-9>T_#xoC7-G}6wIX@%$9|OFj2gs^jKz9yt2hrz3wymS>NV$4*Pb{JjqMHb zxw`hmacFFBDAwL~>fRn}uiG0no)4*Ei(^eKV#ckh#kI8-hc6B{wYWCc;tpEe(Xz!I zVT(JPS{%7J61Lcl5I0(5Bi{D#E!Sa|E%}zaFa$SN8`a&kxjPPF&*R<>Z>P5_9O2z( z#=YI%xO-;o%2Xb-fM#632zb(Pq(+vc9Ol+zSrO<=)rzhe|55U zvcLIQ1-C~FbK^@m)yO+0JIK!6&JQ}V36A{J4|-35-6PEA%)E^V%5L+UwD&qaYIpTF zDA=rEi-N5RHYt#@J~nPf72l?yCW)?W#v47J-19Se?cDjJg@svNu0K-1S;)+V19|@W zQabe~6i9Sh$xbbMGx;gUiP}n^O$_au_W!K_1gS@kbL8Kn0un_B=|@jIRaqZ4BxZzK zI60et9Aj*-wDOgqd!CyqfU{})=yK68d+DmZ*G+1$zn+ap0W7qQscPMNtMpK7X||X* z7Dq|R?&d>oC7TKe2QP@A9FwQDBj6S6-iy3Y| z%Ht7`n#q7GH`UQ!?YI!9p4=^e&JXol{`*wl0R`79(Dl8sea=J6N0jg)LC|^P-lGpZ zbnoN-hxODab3K|S_vX4rrV-G=zaM zk^+-k+a0-&^N821+N^3l>5WQ`@Aa#K{*;1G2(XXN^M>}y00x8gf~+em7p$tokQS_~ zgSdAtJz@idZ94}TjYJ*J7S=?RA3hB<_f|=Ejp#Qpx|CvR+iMU88-OSkPa88Fz)hXMq;NAqdvaF;d|*mPeNH9^qhO zuUs3>m2*rj#F-gLFFfK0BlQ=J#?%#Vkzr+n0oO?zC;!_g1B>)-|# z;#sb?5lga?aoCAqwF9;s1Dyy~fSqiHowT9koObvTO%s>A-MH%|-L0T^nr4<#6+9lQ^#%`r%qh(H&L<%PLo zV$M6ZG_KP=wp*Rf^+=mUWU#;xf0>CNSxrUUCqLb!VaJR(`j5K9_c&?@H8>21H|LMx z9HEbPo$tf;J(W&vM_UkO{3E+?B}D}E-d0KH59GmHw<^9@uV219{zSr4*<8y`(J=;UQ;D-7$6|CNCRjE!U+ zK0#*-8F%rU;#aD$S~Ql`_K=ka4bP}PCNH<7L&o4d2_sSnqU4tZon&;rT^=HPM4ugq zFY%LXX(CTV+bIPTiPWGgS)C9oN>M%NxQlQJQy6n=1%pqBHeF2}ArzEg2$e0qPEXME zq+g#7OVsqh^=m!-a^`;aH!BBoJ|`5*bR!zHz*Sly%l3WESLh)ost>5J-J# z#M@yEZ@~=fxE)K&p`;@L>uwv#wlhOb5H2lIQ(1~T8@%xkQTz190!aWhoQY zVRZv5!*#%ql0FLYdFS(8h!u2CX;l7^NEv;;r-DGJx{b80@>8VM^<>*?sdMna^#^=! 
zdHb0$U~`CldaK*KNfRKX&TX-By?T1uN|d)(r3PemU+li3QHFp&sJu*3_awGbOCL%| z+p9aRJ^G;j)u4`g6DS^FJg&OS)Qr|kAX43pz)7PsR2^ro_muZu>_cyUAGdjdFw(qEn zqT8AM7}-n7q*?V=`?dzEw|TeO2;L51eV}@WcRNrAG3Gbu9l+;z+w`5~ySxedBca>R zOW01nbj{Pg){~o~w6Bpk9`Gg%_A$!ciAU?U=?6U=pG`mD=E}QE_SviNv9WJEb9?n6a!n<} zJLKJsaP%%cYpiwCZ2Dnpy#35ELdEXHmis;Y9X5OJyo)zwZCw&n^$|iw4843QD~8YB z5xb!{#m#i>m|72ys%>urGf9o!hI!5q%uN-A83 zL$(fwUQ3^=8t3`mV}6o0E|^5%HO>o08yEgavFE_lMj#fR;WSZrpVCAi7CucZ$Sy7W znD8oWRPqx9SB+}ktU_B9h^j6e;+Rs{mm!c4q;Q=cje-3OD#p)|%8+i5U%GF8zmkqA zIHBMPf~%_Os%RmOI5#Raui!XAka@p9v(!ws?+}1!_p73_p|NWTLDt!OBFzXe5tc#M z>B9W!1%8-Gwf|1y4A_5MS?xXsyP>9>=uT&`AaiVaVc`dv@q9i{EuYkjh`(@EM6UmF zJ^l*8)kVP$o?OPlSaBK`60qfGpOd!s76u{cI4;ZZLHc3z2*v=Pcb=&K?oi^;1Y}Ku z+2FA8^P$ADl>uH!EdO5Qwn;3v#Sdp8sTC);Yr@LY^n*8APicRHf?>VUaVCH4xEn2# z48Ew;R}>gAYxK8Kv_{9)RX>b`H4?AC>}L1G;?hHm!GiymO8=&U&nggk=v=R6j>-C3 zr^sln2GPU#o8|;sijs_Q+)JPQFRIj=3jS7uE~4KLhy@)wB*Xj=kWq`^V`y_#BPtmB zXZHLBynLuisV(D$dbQ$Gv0wBtd|sM;7)x;0Y%s7?KF`H4mH-yxmN+2GVcn9m58WS` zbvCIB?xTBk%6>dLR-+8=&8RQO7BNy+J|9J~>EH!>%~Fhvkg%93AyYGOCKC>8ykT5nQR4Km`o_)k*N z*odQDa^=M(N0oMInp||%;xc3j3&TAcnd0K6LLc7zt`8#+^h0>?EsK1Gr?ME{CaH?FIFt@)A zM3ldc3QmZ6`YugMB8?s@G^qY0NNarFC?%mV7fKbGw5Y9CKhe(v8C|JgCv|?SO;_#F zY0YM#ar5mrr25jm>8+HO!|@anaQU85U04llPkq0;cYS-NLkdVY0G}gQPRDU$PFzENk;{z|nx#SFq6Jl>v}T7>>wy_)Uq@<&s{+W<+t2JG_pPBmspFv|^Y=0HpM?4k9g~1%xG3^%_SGNT$YQ3b~ddbZA^G$}%hl8K-WKvmK`fU0CRm$tCdmtx~9xYCr!& z%c3;Q&Hb}&rTe3&kv2>*KV|jxsW$iqv}-QN;5K+UojvX3-cfCp-(_c0YUOO2!MMLn z>&N<~zVd_SM#Vqle7tzw;hAF}lvBJfOuZE^;|iJ2Uwh5ga_1QY)1Z!9m- z7brCKDSPfEU^_7CpAP4~KZ`%YRtXA#+L7F%`ez^=k~0Qt62tf(P??JgZ0FYE^IuY+ZOay9 zWPsys>Ir&ErweoCqh^8L|AauS(O*`~HV`vrdRR?bU|h8ilKWjP*uA8qt)9x;H>Gb= z_Cxh$cf%3O@a=JZhrjjczo0tEZjkwAWiYn5!_B|xbJLe#pWEw17;@M#vS-6`gsDYn zEG;C18}AE6cnJ|@ zFL(%N^zkoP8@=T7s)qdL=RTFL%?AE8>^Yo`YJY9?x!sn3#T>7T*r$Srz{?T zLhMy+>8!XKZm0BJbjKGeuZ%P`YG;y(AFWMWf!3xC)rxK2L7Ru)t66o^DIjluOi95OQf; z!;a9jZN%(L7?Q9hJm@w6n*&?8*Iur+wbV!ox81y-$q5D)7H2#i*fsqU75e8%u5T<* 
zu1B=kgbw_3%6dXU!%lN6Y4>TVj{?S0x24^kLST}@@eA7H-58lpakx{cXO?~Q3Usoh z#A*=U6Cp9huXKwJGvRJuM$Ijpc{GR^|Ged3-dV=(yo?nT2b7h&f?s!={cC6;kpUxH{j4PcYCJ*GhO>|!AJ%PB{)B;GV zWL=&!{r`@jdt&*OoR-Oi06W8&9b8=cWG$UorOQpH(sEIBtj9m7<i;P^p7^KN8RGYyZUSd#(JU;6&Jk9N znINUpNDajiA#iHUNSbW`_Ql)z?kIJu zJG*Tfoya7iHPWKFpN*Qs?z^#>cI%USl{TDNJmWBlxfE`yQhboxh&kr1gc_}M5A+Bj7;&7|IL3g+CQu|0+O`_@fE%l}@AMfD&P=$W3sb?t_;fFNWPMFHbp{(YeAkRXj=2s z)1VVo_vX>cbmXsTjXy)7%FftwOMZ_lefP8Kg%r1%deIs7!glh@nJTJ~EE;6&OpU^{ zLa+T(pnS-y)G@4I*#+@sz4vnjSP@;5?1b|gpO=quCc{wNT!TZ45KsIRGOhSOf)LkqfefxTQP<2&RX zYW07p;0+ZPqZws*ZL&dbalxB|6FM8@U^yFu(EoGm@Pc&|Pa@WFXKdpy)Wp{nSl2&K zJMQx3XNdir`aemfx>F+BTQ;cO-SJj2<*(zn8Pu5|36rCGikfTx6c@sr;#YbvL9~%@ z1Ys*-doXD%R_HjQg%V>veA0F8TGr`M_^zefb`qwe{2iuaAft9D#@eIJx>+iyPkoP@ z7!9*AV*LrlbZ@{n%K~}D%-X<>iv5g&j}SyA)HLXBsO_eZjim*)7Cr~$XEc^Voif3$ zKW^FFxZ?76D=)F-_E1(o>S`;!&5T7O#paN@oc+Jpa+|k1+hB3sVH@JLHrNbXb<<{< z*q*Poow7ZzP&(R-qwO=Xy@s1{$K@6qwY4oxY!oVc|QghrYgcJa!pa0A!98cxxx+%_HpSJsBP;ZdZP96}|N|F`%4uh;he@fM4} zc6R*nw}0w2Bvjf(utM2jM4iOV2nCS2o06| zo?8EH1wsV=?My-8@J)8xcc<7FPPd4+gDJ z#Hdij#%ryB1eOqn5JP8*U+M1H-skWGWG&8yLW}|4sHU&9m9cLdUgpL?IbUh*88|s2 z;^4^@UX{GdN{YeQF3l5zBu|zgYbEX3O3FwGY#uB5Q&v*!8uO%MCCN*av}Y?Re+HNd zv62-l!_gC_P>+tjup-q%@|_OJxw-nYmF_2(E6i52l04WHhJip7^S1TP2G;taR|yFduxz&6q~Cd zkvcsUp-?&4=nGK{{!hJrLq!9%S|Vhr%Ybn>qaoCpKYRo97ecoVA)0Z=&MAS_&r9MCyv_VNA|Z|_nh5EY5QXsv zrd6#m2?ir$+dp=HQBp8B6z|sGe~tR<94+aR#!o;kNqMGWAihWPI`0PV{g^z@*4AGh(8Vz?VA_GbF!+K3oD-NZ+ zLHU28OD6h`P>Q_w2~|W?#rj5+(X+|$_(|A5M4MOx;cGWsC( z`BY7>B%X?MrWjg&mb#Q>EJR7zQmdSaZE;l5Ip;p~ZjG4MgZ6QyeNMyam8F)9Z zQ%p0ynFmaoi(I#gXF`+NGQl-+5Vr@i^ll7jsC&a~_Oj}09WBL%`Y?^TRr?h{+M(!- zsuSxt%d0@u#d&BAyWPs~AWRm$gEQN&cWBvXJRw@2|3qD!Qg^IR^^Sd&jGQ{ABa;k5 zG<2$AOOr{59tL!5-O6{?O|&mX9;a|3@w_k5urC3nFR`F6Ih`*dj$LB%#bxxxMDRuZ z`Z_iG!rQ(+c3TR*2GCb$V=qvDi%n$w6&ScQ&Rgh8Jcbr7@LDhAuFqw1n=qTOhW|Tq zc?>IbVN@Z935MbBfn0BIr?0YOn}STafH3MC+HAI`Se%{ZvxOp;p&Vy*TuJHW(rmEl z*_rYZ%-My)$?=mjr_Rn6y;2mJE}xy5pEo5m(>e=Q$AISw^UMCMQ&W3^W)hj$&0F+K 
zz9zxfNy7iCf^F835~>z>f%W&QQ0CO)(gptwGW^dfP-DJH*0|tVq>YlhZ^x&@O4Zvu zs$f`wj0D*0wi?4AcktOoZ+T(%Ehz?r@sE>Js_g~+4ZOGS Pc;5pq9uNCo80`6PSh%l? diff --git a/aerialvision/__pycache__/lexyacc.cpython-310.pyc b/aerialvision/__pycache__/lexyacc.cpython-310.pyc deleted file mode 100644 index 7ec78ee46e51540f598ff8917b3469247c4df25e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6623 zcmb_hTW}o5b)D{+-JN|801_nlT#=TR~aAA_{;+I=~VFNHGS9+8S;T7Yppp zta=)+&8)2y=@MOWRaEwut8CY9Rh%k)_%9!+{N_L9?{-yU$MJ)CC{CQn2@_ksw`Tw> z9@bZ|H9dXL?c06(cK7t@hL_EzH27OO_XhjxE1LFW6#D;SAiM#O{}>XcG1AZ$!Z(TD zbb&f_S??O4r9DbQZLAqzh)4NE(_Ap4d~zYFYDq1m)H}V9VVa$FG^im5X`bmL8Z%g| zr!QD6&JvHcg#l)=B)kV1nb(S`r{IZJZa+6&e=z3Pg)`c+t1EWR@yA+?&Cx5R@vXNs zZrqxgYNtxIR;{&K@#{^Qu^nEw8*6pH?zvThHAR0WywmVZz~g@(l6FRDf-KXnE(}I` zl+j1DL+)yBifH%~F{VEvMEfFicwP(0q_#NSq3yANcH=#*Lq#H>j|ekXC_i^sdv7G5 z%z&=NK)w{pU6aLoq@xEqYC48Uc2g_?oi%!TCl45g2g&-DafpJYkD?<~RO^0Y5 z2yQzc#5)P_BN-%E3R*J=ltF(w@h`>S8wua*WLu^}Yk?Gt6M zunR0atT&v(LvPh(C5{1lM$SGX7p3+Tg?8NG+~Zf zOQU6rC21^sbywzBeTUCGLe$;bEW)uCue(CVe8KB2sW-qT=C>MkA?aFGnp<8;tKMvR zT)f4-<~$U2Yce^tQFU4ZhOlVLQ`ph>$CkXNGuEuTWA<2eiP!z{moAS@dezmY;|hOFIKFu0wU@^l z_Ji@UsfnpqE`5IdRh(9<4I7=Qw79qIR7Is_b8)4zHa-T^ZZy(+9HdAd>z{>V4QFH1 zu2xH}O?fixY6U}8Va`M72&-)OZx)>NOOQ}{l*EaNk4}k6GsGbHq;!+yNQz|i95u)> zV!ls*V|>gDV6tnj4*`IW4v+se0~R)6v<9=A!AyJ%bC<kQe%p7Zq}@4=F0-d>?W_As3a;qpDw} zeaP58V#{SN0(aJ z3*kIOZ&a;+xJRDeBhT!S%X{Qod*shGF2~u$6&>Us1j#1^mLJi& z17K%CekjThu={}(4Ak__V2}<5RY_>a&p+`Tn7DRbpJGjC4K%}>2^ zXR16gwd)gxn-M?<5@^esp)C`51GNESLtfRWR)Gxw;-|Ou`|brvQstr{W3VwfT*h%bbv+qdvm2|9 zj8%ae2!4?FNl|f~bu9h?wuaF>LEFa;a6Bru&r=*o<>?v72-uduOYm$&i>6iRvyP+) zNmtP*w})>w8cxk_6ktQ=c2zjM@Z$6Si;B8%By~lD=itRJVsai5Y4|Jk)`2}ph>GGj ze+tEXbY_rPq&9;jk6**Ezw1fVK7;bw)1#mDFg$IN(3b{-$ZZ{`p zCV2wKIbLTQ7*)#d9Gx1-$b+*FjCn|ep4Q9EOA%*?$r9}3sk93?2bJEGnQ ztD1WFP!2}}jR=f;>*Kfg|pnN zI==6)sNmtSATY}ij!Ikhn#1AD(|icj({ZbtGG1Nvg$E~`>B*+k1eDc%@e~=J;=$FG z2QaNN^S5`D@<=p)9A1MeoHwBdE6iJWLYE& 
zUt+u}Y_(d8IcZ@K!geY^^(%ZIvM%#jMs+VLvImGtsFtD<185St7D309U${)?c7itD zAFpj)WEd&Kcx@|HGm+}#@2J!DW@c@g$90Xe{y;VJ^g zB_4nbBoJkYKFClql5yO?W;v;%@>hWlTqMa1#X*u7BNZpgk}Urfh%OsYMEJ`hr6T;P zDv|7%HLa6`C#Cv!5+ZIg5{?tl#$yZIo$0_)`&DX7g*(Tn!$cdyTtZTYWln#Ow~Nj79m1xD%|7>go~qgVxPRb@Kd z)I%N?1G+Ff@sS>Wxh%Ahw2Bso2! zXMlC8I~{706OcD)3T}H25#v+5;W-3I3N(A);oVOPv=-egFDy2ICtg#Q_|wfC5mXoWQW9600Slro^Z8F~Owj|#T+ zQmxIAUpymM`M0skKY^q;gx8K6{gVV{Cd{e=C}j)@t}j#33=wvXKStcZ+d65&wDi~O zc)eL`!*L>sDie8*x_*o^;MH(6P*dCjUD&C-s<_(ldlmn+wZ^dza{sNXf zr>YW_dd&s$%kjpKU&3S<6^1g<;Z{&4)vTR_rNO_2`tM+J1e2p!GKnS2Y8{!Fg*73< zm85bp7;@oIzpdXN9oV45&ARh82zbroc_c*LI2S@Nq=meqW@r>rau}i z&>s#yNB`bBWBgt2Yz`)J_``|6^~({s82!HalA*(5fh7Rbo>%}3Sf|pEqeo5VA>;9(C@xKIcwgB=D;Y`5)97;Hcx`>)6 z;q>5t4kG^x#AvN5^ILbg)rJk^-VbNo*RTwd8<}w2wL14)o<*pD3AW!i@9}~<93b;r g(_^a*=XG3({xL|*mrd|H>;vXZ=;bgqGnv9~0mL#y&j0`b diff --git a/aerialvision/__pycache__/lexyaccbookmark.cpython-310.pyc b/aerialvision/__pycache__/lexyaccbookmark.cpython-310.pyc deleted file mode 100644 index 689c13c6e4ed8768b6fa779495902ada075e7b61..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2712 zcma)8OLNpl5T4O%wJ+Nk#}6Kf3c_PUd6zhZgp@-tcpa#Mbz<`fYs{9c5w>itq>N-^ zZ|$a10V;6JKd^J-FU=vBoP5qL2a;|{YfM-}w56Wu?oZRxJ>5O)7mGOp&$F{v*vG?! 
z{E5u|M*`*wyx}AeMi_NTJsD}8#=O*&>oQGbN>i0GO4n8f6#q^>hMPfE%ztUziO z((cm-6SObG8~y=g87%xxJ0$tCLz%+Vr=%k>jb&hzna;8>DlEtHFsiK3(O8l7!8{X@ zZL&istnq^UN*+`+R6G@lcQ!I4@?70SKYxoM$QzrQneb6%)-r*7V<3axhM*465q z$kb+UU%yfN)}j!N?t^oD=i&WJ0n*0BXlr!g#NvWE-kKV(pDmpqzx1HoFc!~&@FjUk z1&z24*qHqNyt3jq?Ml<}DrUu6;Z8Vl;nT{rZ>=|NFA6J>9Yz-~eO__RrHM-QTJ_@j zFDJfegidqaH6zFO8UgoLZ7XU7CXYUAY)n*^gZ1%L?{nZ%zGB+kG2IO(gs&>Dy}4yt z){^h9HBG)&4z@%lYQ!$zMQ1R*6sa`|tA_S1yFBUomg$C<%PHu=^6J$pxbc$k7(%0h1O+*@WblQE^k z6F?%U{5Dyo3@T=rJb{`4PTH24jP=7X;gwoz_=fA+%cg5sD<(Is$mYhGQ{fqo)v=4D zqzEIIoXz@%(lK0cfEbPEr9knR@!q_OfrIIARI{()x~uO6XB zPUPAm&+O3RPJj&$7&9`jt@yCPANDi%MShu^!AhbM3fp3vLY==oHFsMS=WH|dy&0d+ zIIi6~KIIwTL1%NkWuuBYW~`W@;ak=^=QgXAa{LHx<|vY5xDU+?0^4K!I7&W5@)3%q z4NK_V9?CJ*+p?bWkHWl^6*9CGq2RqH6d%q@NPZ|XwzuJM$eS#;P3AhD4YC`v@v#dP zeD!?jOnA+e`h*twYql5i0xZb!PY~}%GJxa|5WKaVWzXj}A3<3O$tS2#9zkM>tm6gi z(KIBepthi!NnFk#Dve8WpU!|JyZ}Vc=viG!eo8zlQbE>Bx+Y1G6ts`#s768dO8;4U zrE5bl=a=EbQkGA^Amnfh{0ZExGC0pGDAJL`DDd;h_HUSQ@h`cy<(gq=hx{Z+<2Cl3 eT6})Vy*%0U*}7|A#;zS=AoOo^*hzr`mHq`{1Z}$j diff --git a/aerialvision/__pycache__/lexyacctexteditor.cpython-310.pyc b/aerialvision/__pycache__/lexyacctexteditor.cpython-310.pyc deleted file mode 100644 index 2ff3a806a2f4d90a6eecc2ab3011b19b52126897..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2825 zcmaJ@TXP#l7Vhq8Nu#SB2V9t|2&9N`SqctGz?FrBL@B6Bv@BkrFiuTP)Qvp$XlCec z8+&TzDcK_M8}`9^IAJ}iK7Bi<``o_Mikr!S1` zA;03}>5s$YIy4;u5k%0CY)pH)K_%H{hYqZ=38T|hJ}zyPX5;d>vQe3h-Hobn1b;|2 zYN8~{(APypxX?FbQ&e}DdS5mk5a6ckMeEd#;6^e?-@Fst@9DtEmBb(J_#>%%NqDew zsk?gI4=Hlaz83T)Xbv>}KOlk%HexxQUZ1n+^|>P|ITx;|J|uZb)I=TnvLT0L0!oZX zUV-KY!iv#Q4 z{_IYFaN};@wgz9`{iOfJdjDTv_6IlnCk#g?U|Waw8-^`IahksIt-q4qSlRG;-Fvg) z#gz|n_U>jkG5f!eU#O)ih5YZpFjhv5buF8L7>c z4=?vZ|Nd&Pf3tt(;s>i&Jspg9L*E2(K9y}`%25ZvU@FzhkmF( z>OPV9zy49^AMlO)Ex4k+c5iuVefii0)2RznJ8!&`PX8R45H#uXsY;xbN}cLj`#F8k z=NK>jDv*q96Hc}%^oS}tp;rkb9;7eC(Mg@nXxY-*pu=qmTqTt)he0Hx*p~MEa97&W 
zFy4)f`a2`4#*5?+$o>`HhE9ByrEj0A|4iUHozZoL&gWoY0A8Rk=(t1{$pcK>srC*g zvA_)EgLLt;Fq9)dY!A15 zAw__75(dVWv{6A~%YKr`NLU9P1fHUaYVJ4*rU_tR4HE%_4?FC5+;Il!3+o4w z@%P(BG}^u?(jYzi>}vm?w>wN>V2XU`Q1u#)A6F<niCB6tOjJpup)L0<)!F zM!6**kjPk&IxXvd7lbnM$I_Nfydxv6s>oDFCf9huD2kO-XK)Q(UZUo4qXH!qlIB3Y zg-dVa6325aFRq+}f_(?>UN^oOz|m8myotPpPtH6Cf=^*juL5xy{5a=Gp=X?O@ReJ0 z-5QKtnDJ`ucbC?{iwlhZ&g<$jh@A*O8i4iXCN%wDAOTowSqp`{{~P652%xsCoHdI6+w0! zb={P*3Ou54!UGDoa8bHSa-K0gXDWbCcLK?myBYk8=#&TQJm^@2GktBd3lGGmUcRs? zmOIb(Y7rR`hMlUyyjIIdUIAj6f(UBqD|iaAPTSH*#k&B)Fwn;G9eJQZsEa$!*iRHX z+w%Ky98N>(z`g1cF4D1TL;q|d&qGlE9pBs){#`(N5RHBWCVc@2hiG7&@aNJpWpn|k z3*`7&-72cWs;fkUtijpNuu^rY)Tr-X>NBMD@Dn=67PSVPPS6%;fVSu{2kPikjzte`uSIXYP75?ZQJ^`rhqjj%Xw&b_k`yW1MM>=Z zym>S8=Kalk6ON5#4g9u`zbQU_*)aY{oxvYR=S_Uk4^aq1u)1+0t=SEx`}_viwRywT zwROW5hUWcEb>QUi}5!5-66{D!fL{5yM&WpSlM?EeIVghwR z6vZU!2{9!Ope~AOF@t(iuvMdc@Jmo+tbEB(Dx*SG??-JD-x9v)DvB6{Z*p0TSz-u2 zYdm1W+_Ypd?Fn<%@Qj#ioMRft>f;;-4tg2~2VKSC)30o=NZRB}Vv(dRCAG??npnE9 zeCeI{f+%<|SPw7M>b^4TexPz1-i+@NsLWI`3b$m%zh3k25@fj+#SaZRf<^xI$^1sx z@aG$~VBVXrY{*))@bZiE=fcWX!w=$UKK7&d)o;8qU-xb;%%5L6|LW=27S60iwZ>N6 zi)&%9)|BCGzY?!CJsH2Wc4uL}+T1$X9S;YnXtsXJu+*c%qAS=DCEM^(Y0)1baK1K zIpb#OJG^ZH+Y*^gQ(i*t=o)i{t6NW4CxK+wvaNYBqW8U9HQIJO7U!`8cD>KshmbgVPGi*?$g-8KGF zGupW%3mV7TV@Wm{nHa^F|F4*vWJaz za^2Y&quDz@Nt~{n*7xwR8_v*Q+1Dd_21am}U@IOIS}S^V^flu|d*actWP&`2)|}Zk z`I{o?5DsxsN~^L`dZYB5(2jfO_@!I1R|{y{%U3@*8HXn|(nYV?B-pwP8>QzWY`k-H z08^$*Uh#seKZx^u3BRjh42pxT#w}l#!u5352)ov6fgjP{dc7A2ccqUNhg+2mUpxU@ zUpmsFbLRD#Q!apHH9pWM+IOb+vl7h1p2{R_OQSq{UP=IKL8=$?x_hv zJ+DzOlRjVcm9^<_M~dHp{@?L%!QO&~7PNGFEF%fnn0% z&^z#jt)N1eyUNp%*XYb^l?`y0Y%D&fnL(`;uK{N4_%Z}DD&ZEGmgzn8k7^ zU7llkz^2hc53MZYUlq8;UG&htPj!J6d68Mb&G9LI9Pnvck!LYNAHRQym)L1$eU;^h zSlY7gI~dP#hgqilHVCXxFxUH0oJSU?1DZKB1gj#HoG}o(7##y)3g1N-lW@2~p1Q(^ zOxd24-geoP$1p=>dwRN7Q6qhx?BkX^1k|77i%5Mz5Du1P zF4s}OGlUacw6?}O0X&0ZC+xuGHAH6irXjB+W`fwdgI#D=Z{Kt@TB5yLXL>m4{6Rb# z1N*F$eac$7w!EB1)V6Y^4?~T7A?dGfw9OH0-mLU0#(KCDHq=N 
zay^Xq>!mylgkR!|UO-_%opMmWEY!z>N>M*=PP3w!10;{n{f}c7p(2*8XAa4OV%@$x zevJ;vApx(5yTo;!O>o3kw+8!j$SR;hra<&c*$0hvIs6vl&{PUKb&7dnkz5}@B5kLK zo7nx>qS($PnZ$g`XuFA%xH__cR1jYv>Pm0*)@$8)PIr$i<~T`4JY;ENL<}l(#ZR49Cfvk6 zrB~U-I_~C5*l3>pGc85y7L1(}Lzf2JQuj7FwO9{v2}Nho=*=rzS}a;2QY9h%D38c1 zs8Y4i8sP+MWQW?lEh8s0;k%RtU$-qk0H%gNN})W0u`lpNWc=_U<`mSV$mYlsp)BGL z)!-U$NHw$>V~#e}`)H*8A~mimOaj73;0%*}QJbs@gE~O4RxW{Jpm!v(w4nmBGeW|F ziLt~Y(|O-$<`cN2ky8qLhO7}f4@=8{eXxfC_Bq}7 zmUlQjo<$yya2Dac1~I`r*af^3?17Y1^G(NKTE7)Lg|y#9NU-Oc%I-c+%8zKG=457R zqt*l~RpxCk*rbMucws+dOTf4Bt`L!6=ms2QtUJVDVy>gL5IRWnhFDBG*uTBf=aAd< z5CXRp8@Hi>pOT|8Gz?E6eHgA24TGca!{GRlz#E@%1pY|K6RT=M5H?tY=MZ`X!Mlbn zC^ShKMf-v`HoS0iTk9FZv><+I0h)&Qrmkroqt;C-$OyIX-p`G%Vc-aHBZmfJ0{pWq zi<@84$k2J|J%Y|mt||H{3c7kgH!deqD>^qEE-o-6&$cjy^)a3&NLk3i{=oR2u@1EX z)CCle6*Kxu1+Fu2RUUydS)?LxxTda0^d)H|Zr8o$+DK57EL<=gGS%da;aFd9l@ljb zCF0lr_Nww4p~y_#wpAd&=81&uLcZ8BLAH zd($+41t~3c+Cmfb3R=gBAadyxTEe{GspeQ-2a`p(cEkauB)lQ@A_eSwtbd>bRbm~c z-(ontmxfrCV+H_GA~M@luKyTk{4)+aZGLcVyzFoOWR2* z=#JP=>lwgaCv7FFC>6??1;neI9YorMS6E~Ulr-^wTh0;m3T%#CBh4yLDIP}Y6~93% zpG2X?dueHfgc5M8e~iQ{|_aY5|`ACO8UQa33SilVB}SSs*qeN%k?hKLTo2%Cu)2{ z{0$Cp?mHzqo)me#}v=m^cLd$(ruViF*ySXFD@w4NTtl2w#@w|J|@4i0m_Kv#N zo0lh@gheTfd}cjeKrHr3$D{rYGR;*i-Pk8IEO2ZAT9hfTW^`;!p1 zpV%o&(!Kph_Rza%+1;b=-p-TOV;DG`486saApD;BsXOvSy6a9`b}*VWR{2-}T4CE` zf_uX$S1YiAoiJDl>_{5z5M~>)1NDu`18{^8Oz?d;HEmO?X;lqhVZC+(V~jQ9>MxlM B^fUkf diff --git a/aerialvision/__pycache__/startup.cpython-310.pyc b/aerialvision/__pycache__/startup.cpython-310.pyc deleted file mode 100644 index 4a6cd5f746c676004d9684f95090a71c6297459d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24043 zcmbV!3y>VgdEU;6a z96+q+WQd@22ive{sp7mU%0ZGJa8gNBQN?kRN>$2LDZ9#*#EHu#j~z=DS1Lv-Nu@+v za=|i{@B4dZ_H|HX-_`WYbpPG`_ut+BtN+=?z(6*Jzvg2vs<%Ig#r}jZoqq|uyojs$ zNh=mpF=f?a=gi$YXUn^D&XIfkTwLyna|yX87gOibmVwQj%Sc}KTvqP6bJ>MhDUWNQ z8Y>lWAFO0A4XqC&9Z~k?m~vG7j&p8QB~YjYETW~zF7^c5!|QLs2an4i`t~dao?&Y)FkfP)Mhn>`*yWOZN+_u+NQSSUQ|0& 
z5%-;Hr+NtYht$Jr7w!+MN7QcIcd19!W4J$}rqv$YcdN(M6SzOB_NsljKc@Dp1GrDC zgX$3Od(>g|B<_!^r_>SLpHNS$XK>%EKBk_~zuHq}!m60n2&w4v{7A+Xap!BJZ z8czx2qXy23@jry_fI1P>JR#7}dsZ^`4%(egoYEYEAFoxLt{*@3=7|&cm@)16argRC z#ZR9%1KVEe!+PB`l8`A^Gy?#l~V~|6;YizqG%6 zQCFLXo;@J@T=q_SZ`DhxX6T%pN)L%%P`A)a6>q1vSqv>Bgl> z**(8h((d8&R}bw6(f0=B4UM~FdNUtZd`bluOEFILKzXBU^;>u&?2<{l(* z%eF?WH2%jdJm0WCN;+xVJSVMWl%KZrAi%q(^kc}#<8p8{znG52+}L{TPV81@EwhdZ zyJK0%xn!?pl+6vI61OpJp0)0%B+Kv=zk>+@Ov1CcVE~f>45t249JxuAy=~fHIW`c= zv3+hQRvgdif=I2WJ@mH{_u`l0+*)e*wv8U*>lrnI{DeG>;wedtyaZYr!&6pmLVqbQ z6=7m~DU^-hwpY?#+Vlaq9Jk|b$Bf4#o`lGFnGT4}>=CdDFQcXavE{b2lJ&9zk&i|) zfYJ$+ZoTc?N~|S%N((66wwkzLsqJdV`)O4~8#ynx0o2ab7~pL6Pz0Cv@||$4!|tFv zbZK~f#2r-+-?7!MoA%0pH?Tf-C$>^hkKD?61$UEIz}DJ58QY9JudqI@9tDjG?u6Ri zF1yK%m|YoM%c(~vVqU=;+#H*T-O640-*3n2+v72J@&oJogcVyEvSO`aZ|Kry_1IVp zZv-{sjr2g-(cFxBqf*BC&Z+j?jH-RX+>Ci+J+-I2QLx1^lw+mdimkMmQS^!(5bgOaLNJ-Vw$t0$)AxW^jP_b` zE!k6AK)X z_k^_eas)N)P4_@0(C18m?;dYYPicF!k9&_xS?_2cSFZ%MpYWdOsXgF59*s7=W_WR% zj}}n6Z8eS2&PMp{_4f9(L8v{eX&LRY2yUOZZv(g|q_tNgsQupl4WM3IwN~v_2P+~4 z_@p{6@IR(r$9_5B9q6giroch(pp^Ae;GlxiA+q9-cc`cKly?yP=p@R?kE}JTr@Y?H?K~sHv&mkK>vo{eIlaDQ{GcO?X=APydxc0IBQ0(cVl6xvuaMg8BjjHnpw@R<}akwTY}Zw;D66}&p_^v zsZWyoLHpQAy+Toid*@Uud#qZk6pz=Jmfd?NprBQE+fdRH*DgXqyX)Ygs@#iy@?xdB zaMAVc`2|t>T0<|1D*KkGvc-e@4o*Aw^RHHGwc^=Qy;(dtcW(@O50$-|LNjr<@-B45 z3yr#~H=*FJ<;R;0g> zcjI~-_3u&L(v$2Wvs_mdU8~kB)eoS4Ld~3F*%l`3N{h+r7;IlWxDYXcE<=O-)g zE|u!4wWCdfX7NHp7w4*rmEwug67YJfTDjIDb^(nHz}d#KE?0^(4P_wi(XyI$q(475 z-_XD)Ae^q$s+9|OEh)RV9dLIYy^Sa(OZD=_hW6u2C3XF-Et&cWlz_NOnJJN;pDm(P z1|_7kWkja0#0neap8q17VSDLVYYWI$k)^WlXsuB$)tb-a!N3J5`l)hbaS>F#pXMOg zY9Dqq1BhL72nn%Uv(~LUNN+l4WBM>Mr)}6uO3gJ+z_gfeyz4s)6<4rBNv_ORn`pkG z^wWTDqphF8>)*m97!Sqn59&%8_5<;5?sKsOE$zoNEv@_3{(BsjZ|(DwFYCs&WcY5FRgLwYtk#=`GckA<;GH_ zP9$g#1>+FZ{cIb-o?-kN`jlae#qQ_N$S{PQ%|3dDv#k%IiqH>}TBtQj%2Xq{Oyfrg zKzmCjS*Z1o`9@o%H6Uay3SFNq-c!i)lV>W6jjR2X*|5tIRjIL!W4f^zOq_Iv#s30( z>F)R&QO6I>Tx>M3*Jf%+o7n1QU8&TIXDZD`ZJEa1v&CtCV3YXkYen1L5Tl^6w 
zNk179h1)XRgLAVzbow@-I6-|)hRajGMv&u7US~o&Puu#_d_Em!v}gQJnDqvdz6See zRiBCKe=2$+gOIiJ-}xYGXRDB+p>?IX)Tk>^wD`u{Cj;*Fqx92Bi-9&R&`1wC{%_c4 zU#tHga{MGKZa|I?ng7=($9Y!kUeSNfEFs71drmH6Ay@0=+A>5$$t_aKxk3~0MhY{x zcmWHgS)8vx;#7)Dpd1%s(Y;tHHkaoYt8Q_A*>xNB*0z4yO8vjtLMyb1%T{`Z5QhZ) z8!fi^2T(V~2LJlhE%j436Qb72kRTj=KP#jh7STc_!HAYMdjX2mT6`_B;M@eyHUb6= zveY49f?YSA^anbm%iOF#)b)zx=nr@02n}VY{UlKqGIXzsJn(LEZDwO>EB*sNu|Vzy zB|sR?$61SFy|ryaP7<+WhcG*4Hr!&(5K8X=RG>_j+TN>gO%3X=;+pz2(xH5Lw)msQD6Lm<0QxGKI=Uxp{P z2@xO?;H{GG*#rAHVITeH6PwjVz+ zdu+~6%tBGV4#h-RfFGy+;Mp0+jHbX2tb$XG*HtFxte~{b(gTY89Gb@u=HZ7TCX)4OVc0o)8*QEVQzk0HySR`LOLf@DllI(Nb{bl((sro z%{Tpa5zGMihDY0=upMSq)dlLADAUh1VJSd!sFpF@6tFyoru}s7rQj(yFS@ZIWMMwY zHY*fh(2m%aKj1Ec_l8QM&S7GG3-cIOFsh+H8oswxs-~Z13q}$3bE5hLFNJv`V_u*^ z`I)AuG|*`L;hIrz+S-kuI0-wVDg3~XFX9>23PE$_d1@-MO#Q*~ygVEtn`r8?pJV5S zl(X0rZDN3HWdd^GF&h#D&jTGKA#V%EWX6b`O<_*QD3MKFoCBJhB?BC2x5(F_u11*? z(JE}0nUfnf&5}foo53H5Iw5C){u<2K+^oD0cXcCKV_}nFKa#sS%a1k2@2VwPsrj>SzI^idoMB!AMn&%CV0wuxM@*a^ z5p#tFnu%yMO+E?B{xY5}<7z$xE|!jGtr0tmV~P={fFq28m9=?gt+7aYcFC{3eJ9H!hBAXz|^G8)zMeHFO8JhFr&uUrMYeT^xzv03m?` z1Rmd{ae!dM^}iLa|IGEL;rPeFNNWg3GFg?x3Etx>wPM5V??i5Y=QEDD{c#W6{<$?< zdbH)pCJzh-R6029DR?~m8N^YLy^_GwkQY~3mGcsphE?7;=GRBm01iOlio4*bf*O23 zeFrXXIPBTdsA&oCW60U0hS)CNqzvWb=5S~shQpudj%EkFL zbxIFolJzu`PcV6j$;(V;nanWRiUj6;vZ1SRBQ(xyhKz(F z4?ZrA1yOr|R-$qhSi#|tE4SGla;ZINVG`ZS=rMFHPKA+JtJv)!=y&PBUZ%POQc(~H1Pa;)V?W1kXr4{)(97&7J=a*Ofq*rAm7TxO{0!tdrhXn8QTnafGjqphUOJ(_z~WzEaeTg7SNgMj zOqLsP0xkH7>sR4b^sRUF&k^irSe5`(>vsM_#DSIgR4nzuV-`Gd0xlOf>Uj<3vOm?7g?29tA7kIp&xcA zspTiJFreQ0$!kIsGm!SS_&vaUiUVOsX`ri_)3;wEG`L^Uec%fW5zya3HXVQPfmNWs za)D_Ef?yNLoJoi6UY|lyiM?`Fk64=TU41g28%c&vp5!u)W1KPuvLQyED@3sGMDLI; zJRG%o*bgphqg-SZ--P2ir>g&qyY@e$IheW`PBlG}O<^Z~Qj4=se;37Oha_J*HS^l( zGjzwwItwj^gjsbxV`A7ug=3=pOk3NzC~knB%rA31;h#_%FEX|d@ZbtHc>pU?VfzZa zdx$`>JdASPxf4@1q^kq@0(S!c-MDhvr4y)U;RxM~5^cp@D{5;J0TWm@XeZ$&FQwYG zICH@pLKaxYoGW{UQiyF4PDjGG#%3DJHC3!P+#>BTaoiNma?7;0u;CxiqXHc2w6jh& 
z7i1l_*Md-XI_{@x#;t8uhL{{UAi*;2A!r$m)kshZnGuUSCyCtzbH5Y2U(1AyY8~j4zA{V zNJx`F&E-)?pyndiS;K~6=$=a96c_AA&L|TqO%+sPdH{hiwJd5ALgJGUkA#S%pgf5d z5Sb*gAq?zGL&c1nC?!CRPyrPxj&>+ytpV{>6i>n}T7vT;A+LAi_Xl{$*bGk@^#oAY zMJ5yvzN6t1+RRrL1fDSQ*1+MG0Fgv@6Ifjb1mwkDLepgO?bEO0SYeQ-f6d~l%C+K| z22Qf>QQUDScKwl^jpEXO#N;rO|H_1O19v`MEbx}eOo_OB2Km3p{;7bpra}hMuXF)V zaSU!@YgE6qi)1=w_RzPPA=*ldPmva4)PuOJ4?pP&vHWB$xX;VoV3Q*ff)aP(7r1V? zx42?x7SU{)q7YOMv0Rh;eaZeaBu& zqJ-g#yZTB}S5kd>Y2?{%;(QWm>OA%%p0VpzGH&)#Zawb~+_9RkxY_jrpRfW4C4a~r zfN5r{I5=I_9mc-TAUZ6&KH}w6VnPg*D{pwo4fB7(OKq5c(MxZb@1ot&JKzW4tTgIw z!KlmEGk8nm?E{s;o2{}ddGJ|8lmW+0UY-Vym!=asSnoNbvlvP963FXYCEv>GclelS zE|zdcZ5{HHixpLc&(fK%E=-Si2pr0qHf#0s4ZTpRS6dZTa!XMCBG*x>R(bb&sa*Dl zg1pnOzG05N{4uDcJmu?g4?45CP+en1zU;fbkrJUy{v+ z%K^ewbPnJfyPttiR9vJlOlSQ8I!KQ-^sBnGbg?ON)z2<0SIf0h6DOwdiZ=A(taywa zXc1yjw6!=us+!R(3InK{^M`-y_E%7|ZzE{cwSiyf2K4%wR4 zMwhf_J};RBsz`9Zgw}!FDOK9M8Pb4Op&!4pT5k?72>EnxK#_7*J`Ig9=SWQk`Z8*6QbtxL4}rK=ub&E07H zZoD0E(}Apgs?moO*Vuz(J+4zD$JMqW@%wsnkfKrwMpl5eP4;Zx+GsR6F3K z^iuo+1?5{X>M$g}UC!`cQ#TBTkIL5bUgnN<%R(y*dfzIZN`AeP=C>9EWX`K8)t;W%XkO*wM@Xb zkf)~Aj#UR7W{+U-IB-R9d?#hhTzz2aCxw2TttW!H+KX_+!TMG+SAdueAod~5(affS zKmfMPRZqS*h;@Q6bMaXZMzh3qgjGWSeCIlHhoHP=SB6}K`FlfmEO*o!I?C|xYBjbp z?2e-Jz{=1<%tKiI(HKiN)5o%F1)=lmpakjvETG@R+aY=TBH!FG1L=)yhW{LLJ}YIv z#8#m6j(hl|LW0cS1N6Hd715!UAybOy^a?&XAfz|)9C7(Nu7BiCtWU0qiYhqgf%C-r zgn{e8`#0Uq?&zA0lXB}Om~=$vdqS;5->w*~Z;vSbmjO)g>L0C}eUa$f7vV<&?R_yC zQJiakRJGTB__;uP9Sxv->ncCiI4tYLtm^QCfNr=a;meN$*1`8x+yPvf;pYL}kdLS% zfG}$}{5+ryVWUs8-zL4u4c6{tw057Cez6TYoix=w72! 
zb+dL8_>`h^O;34K9cya6n?nwIE%hcAd<}Y>rivyWEWHy?2+u`atbQ1e z#{BIJejVxzZeuuX^}j(!x_5~667vWPu82H^%n}(%Jxl^x^rL)w3`zAqaDk=$1T*$B zkt3I_c%l;tdTXOre4|va)SfN!&nOdAKw2rBo)@79K3FCb57F@;iK!tP(fDNa6j|TH z5aY)B-W=E_8>k0K*1IS@L-XCRXN*O(fhJpaq2w||1rrsILR3UO)(j&NWTqWI#h@z; z!>GEVa(8Plqd-oymx4J2(bK4XKYy}Rhx6kBu}*-1ev5ik%wcha$}WzDa`0QAqOp#o zr*;w%WFqV{X*4%|3GMiCDj7clxuLgHpD^V97HXQOsP%0jx#5NvhsGwt(4kTNKPZB4 ztG!&>E9ErCQvGeNT6#VF1nkn&ullLkPtLvd(wVbA0$Lb+ma*noH>5X*4R=hMS$Gc& z#0be!h@_u16V%rO%>g3XvRpR=y#c1XF@|xuQ2%xlb=T7Tc?aZD-Npnn0=Fp_<^5ik8~O6&3N)tjJ~a$teCwr z$l7r_FO26HAI-)5VFV<}A~;^Z&@f}C@u}Zr@{6c3y~U9EEoS^3HZg{%diWvcKr(!u zuw1IaC8GT-zPgQ`NzNdmC=U+^8IbTFKYTOMNdvWjJDJxsBFE;riu@Fg$GE=mV5Z*B z&v#6ZpSprg1XD-9#4!}al0Jz5DTxs??uzVUNneId{|e&5GBC;|TDj>D5w)OKgVJz~ zBk9V@%`+=N!Mp@$3`t2RDmgY7pyOwxSj>n4=C^T@rJK}3#aBs6G0No>RoIoRVp+gN zlzt0W^S40kbOO;Oae7b??IDpSS$GH-VUkWwA$Q7}gii%dd*qsk3?2|p!O6H4R{?f5 zqd*GIkmP04lPI0E#>FqgmeNTc8m1judeEYY+RVGM_SDrw46{ERoS?h|l|Gpa>HNTJHYzMsBN5# z!PbK22|bA)`*xQ`Jxlb6^o-i@ECf0{J_a_H79F-db!i_Jf|ir8i%iS>WH`h-4G6K* z$ zEp?1ZycLrj-U@vj^P#t6W{j_&nuKxq2hIn&^F`BuezA<8Rd_0dcLe&yeirXzGKU0` zj2ym|ow>+3$tHmAJcG1kOm~U~#&AgNM#!~g>tSQEYa@vq9IQ7G z)V(N-;3PuA!I_53CL9f4Hw74IiHH!99m$NpM}GV^#*L#a*}0K`-$0F_yR(%8gTS{N zGIETOLKxoIwmU)?w6O7?LXin;_$;4pF!=%#Vb#L9d6aDq8M5NX0K;n!5)aR({Hx0|0QXh2Fd!#9jBBCNi~1tGK|={^fuH7V!v zhUwU3J;HwAyac<>_A*2 z!3M*rVNgof8%z0(qG1c|GH$-^GJJ=RQ#PKWIXE#%N@+5zxsG!Zmp#Vp^D7s34BOKkr7`pO- zBhCxOEjb+ZPIpQ_R9>J)BB+gUOn^m={tz5-Vx-0lvw@?64xf-N{Ivk~2}c#@qt6X$ zwmkJz`gmoMQzysOAhg^mXPck2i@iQ?A4$*(kW&?#ZS20!UoWIVE z{uK$_;*6mgf41P?L{1jmM$_u zM8sz~r1zNIWb!|eOy~W?1%B^f0u*keuP8{|dbR7p$QKY3fgG-v`5BKl`7Kyt-z%EXv!Z&bvFR@@Fyh$LA(%mg8 zkZ>c!vAv8&U>?7Zn~BTHw#g)?Qq#$vDb_#F2}qh{fe)*r@4DLC`ey<0A31RDSs5_T zxR~1|>>QL5LI$!M57D_MVy?Q*H4y}J2LdTzqClH8Ku}qZ1`_O=If!Uu$30Huvh~U} zW}#H?MTVSF!Zjdvg;*G3di3lUT6y0j*0^H3EwTE_$cz5?Oq_|YKFD@u10Lej874gq zegb)a7&eH5Oum<+*6=|1NZKC)B&0>8_P4{ok?VvC{tA_U24&-Sa{?)!6#c*njzhwj znN(DsM!6gYMn``s9tQ^cL)0=4VuF@UJPn+I=qHJTm;uP(LZJS$?{?c6ltN*@ZAZV+ 
z5|(8}J%>|qoi55e^a-lx3wTC9fyP3u#I%=-@S*e`Rv8>UI5>#Ju?w_}F}O~tqjZm( zkoZwA%3eT@N08ysNt;51oQNPNBgpU&r5kdy^lqa~{1jNwdw8VM4Y>u7Hk{5|Be?J& zC8R><%!R)m#6bE1%s@_U3y)F}HL^XRad=45ZXttGL*wvwCxSBB!#ny+6){j~uVBTC zAB@Vld{x$RiZ2v(i%Yf6JkdQS_|!yRYp}Sxj9M)<)y5{Y%h!4H%>?Fm7*2f;rJ5%l3YzJQRBi$7YY;~m2ZDn=dauMoR7 zFQx4GZrD$&U?fd3u-zWq-IqJYINZ@)holOpptIP7XZLjf7g6{5)BIHUL7fAv$Log$ z&^cDt4y{g(5MK0O=^yQVc1M*>A4MTnWn?KA`8QqYFe}@B!nSSM)+27Rk&SH28|-`l z&Rc#bXf%gp&G_9C^RHEEOZYOYYw`obP(F|}YZbBfSkJ~e2gBD-!%HViK>uxEjeHZy z?r&=L7@20J-w4MI;rmrSb2k`jtn*0}h}ZG-El{KR3h<>*in6qATAJ^Tm`9_M3I_dL zofmyl1^BDP5T(J%7W-rx=WPa;xSgRWY!J?Znffz8n1#_^oWF|Z;!YqkA>#Sob?DEq zrO2`MGD^YwB6~KnFU4)7oIoPcS!G;Zjz}zHw}q}Q@RvY(gw}A|Rh5D*mO?4qN0D8K zU$_h8BrP?_74U0(!UVpK;1qFMvHfnS0r{NChUFK+h4ynd+@Q2J6hVb{bT`y+U}c7F zg|=u16vi|v_VQTNR%D9;yA0&qI9++d@h3Vt{yq@6RlrItmL!U_Z8wS-*uzOXz+^)+ zvVX_6e-DGkD$#g~(dShGt z8X)}<{Fo4a0f%}uXI8rY3cgow`wc#~XGgf#KSW*{NH%u?hJ4F# zAu)u+uTJhn|GK#*EzKHT?1N@b6#4jRytKv{b{9H3$R5@mMdJoMi($72x5w8S{D=Tw zMGji<^)hT0Ut=ITumcXN=sPr;-~f}jLA=Y>LR2JBfOnP~id*T~;{9E77pu*pu3TB> z&&A-eRO6TL*YSfJ^7|RcU#=iz`gv3|l1bc`|AVjJXCf}m@8Sup{+rAYLzz)j{K7>G zpiFIhe*ZbJ*hbQkkTys(0s|@uLXFs(Wqm}-0&WJ^hi@TiO+JW7{G2o*6Jn{mzrW}v zQ~)ca65<3u!P+VWf-WGCeBh)11+coC{NY5n8`t-No>^+YjtU!&{g+ViD8~LFeW4sX z4^)!)z(=#>fg{)d5~bnK#I^>Y%i+tYJ{0wDu-kvdq=)2pkhdS*>VM5bzIAm)JC&vy ziQN1?O1nv{u<|?`CeQz(NTGn@H&gh9TwDK?>mUcea4b3eayKELfkb(Ku*9_pu#05;-(Q$14KD9evKtbiyOr2j8|{$}%b#{hvdGSpugc=;g|b5|WPZ zV0+=8i(UDYn}ey=)k`lPS8_78l#GPI=f1#a%-Eg_M-jXSwS%{gQ3N?MigDm#u#a-P zu>Xw??7!NDeFr?S|2Gl#QGRd#0rpwJJ_qbS6zua6_Ho3?@w`c9O6ZkCnEwE`3F^ky z3sGE5Aqb|gVm0W0zy(w1)5}awjaa(;H&VVKgJV2 znG!utSkO?UnlhZbirDT%xv^Y#5kX?!^+&8+WiywU^lX`5M&8rdGEK6+f*lt%bOb)D z?KnmqvvxyawQukjjCa{X_>AA=nh*7YTpYTNSy=~1qHN39WC!rOAlGr~?Ow)LN%(YS znK*b%WT4{U83_vaT>y_(_#w2$g86+6^Q=!2T!#NSJo%|cb05EjGDO?Mu7p&7pHH7B z?5coO>Q}3}Q8ypr3LU@9!m~_%kI7R^-eyu}@*I;dF%bec_#^=X5*tK%o3NE4`;lWd zI`hO4W?~BH5fYtTG-r{2!rz6}2@^7eX%k+aC>8LddHD5?c4i;W?g8qzfp0IK4ByH=V%=cs%_G Ku1zQh;Qs>^Q=GH_ diff --git 
a/aerialvision/__pycache__/variableclasses.cpython-310.pyc b/aerialvision/__pycache__/variableclasses.cpython-310.pyc deleted file mode 100644 index 4b348933624221b45c2ed7c8779c089a792fcf5d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4892 zcma)9O^+N$8LsND>7AKf+v^W(C4@-;DP!VYXOq}Tu#AOm5K%G*R*GYTX?nYA$L;Nz zo~^3!u1B*ZiXDj?0uEeq*bOIh~+Q2*8dxyjQ4y~57vv^OYZXV_~;Hqf$IS9C&EBN7vtBSN@5VQK-NXTM#(SN7y`FYhm> z%h^4Z>yAR# z@LT7j6Vqu+)3F&zfBMi3>27X;4>@Co87}ze-i||=5GHg#4|jYKlh%laXo`8XgVDzh zIZrGQTH=^E{un6k2I9mXzrh~yop2nA#dCzN4@5)Zonx*Q8itmC4r2&a;2erz!mB|8 zLJ>|{@tEDmybVu%h#E9_!_%wpu}}Fe=>95J2G<8yI_}8_Os~yw18|Mwp2FPC8Ey#f zBgbuk`^gM90{3^0+XVLy2i$7+fSI|6f8lH=;%rnOM>{oB7pKSk4DQ9*xb`V5W+(bh zw8ghr`%I_pSwHU^8?F~+YMw&W>?tAflnm`@KqLfdhc?n#TBOQG51=HavUP3JY(t7Q z^u<-*iohtSkCy5zCfO+O4L+0^zG9ocF}^D|P*X3uHLFoEMiqIM;}CgY$ZcD5pB3Ro zS(eeErVpKsPPK}nX%j;GZoI>RG23Iie2@L!+w@i|qrmM#ZB#z6VNq^Lg|@eJ(T1kn zkbP%16*-25Z2d}+Ym=3|!PQ%|I&F29jV|n~D$ZFd^AlC}R$NSzv@qE@l(9c z7bb7rr`F-Jz@x7sa0A{%i9hlFGdedF&(iGrNA!bqg;dvh4%!r!f>;Ebc@Z@ZtM%1_ zK}>qYKSexpuk-7I{5s+jXh2p>r5$!0nCe?tXmpXB+Nx9U4^p7B-2(W96ct`ussxzR zL$*k)>LjMML)E~i!s#`tIS8AmJ>d?OuM2QypFYNFno#G95J&`J1z#Q($jtaoyGA~^ zpY~*u*m{!mN-->`-b|9sVOmt2N~_2{tG9?;CPJa)5Fx`*B-3jsn%r{C7W}x;Y`NYH z9Y-FrnQJKe6v#n%79NFr;X|Z>jS&Jjda2s@8gMk*0k?#T3#(Na22g;-bJTYii7P}s33~E+>bo6 zx&W^XkG$*cu^n&h=@TwKdb_^x_PA<~z1^VV>>^_B@f}2D#CG->Vlvg!$c^XPWmgl6 zv|NqmH>B#zqE+O5*?RXGZ>iHHryEWbocDJ_U4C_7{`&ZazJf^~_@l8RGt!XxDo9LkKVkNG(Up~O&C&+suq%4MkeCXz*@A0&Px0l&bf zx=2ukV1GkCw7wqp)Eco4ynu7H5`BM0ajrIS@iE||g+oWg%p<3xefFXKF&d*fnc#hx zA3?F>-C}ee?p`{%@$UQTB@+m`Lp93d?6wQ0*8}fs6`_(q` zLcf=8SJ%9a-EAo3+OVrGpx*W?>;(1urZUFclg{vh3>zLo9%Jj-=DPLaL@`yA(H;!1$GNDqmqe}$9JIQ=zj!B zj02Z0oOk`_NluhF-v;XV!J8^`ocGr$9gqm`E-xeC6bV3eiO}_*gcafe5P#hgUBORa zZ-6@J%B(Qv>25!#U$NC;PyZDQIFV42{;-2`sUJ8kaH z1Ft~IxFWNd+1A#TG(*)VC-)Dq-9HF^mrot+dobXejo>{@J>mqM5y04;w4P@K6QzH} zDg7%)m0nT6nCiLnXNpd~uji_u=7Bbm{A{B-r&G*-qC=BFoh~C1Qflw?`1I_(+<4@# 
z->k%%HZw#Hc0ABuI@#Y)GzovAzfsdy)ZalYp;Xr8R4(ecNL*R1yV7%3wDFZHd%j9j SomnjWXyGjwMX<22xb#2ERZT4b From b98d7998283532e325cc7e35b5f3fc57040806b3 Mon Sep 17 00:00:00 2001 From: christindbose Date: Wed, 10 May 2023 15:01:30 -0400 Subject: [PATCH 113/154] Pushed aerialvision.py --- bin/aerialvision.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/aerialvision.py b/bin/aerialvision.py index a5b02f0fe..5cc7ad983 100755 --- a/bin/aerialvision.py +++ b/bin/aerialvision.py @@ -66,20 +66,21 @@ import os if not os.environ['HOME']: - print 'please set your HOME environment variable to your home directory' + print('please set your HOME environment variable to your home directory') sys.exit if not os.environ['GPGPUSIM_ROOT']: - print 'please set your GPGPUSIM_ROOT environment variable to your home directory' + print('please set your GPGPUSIM_ROOT environment variable to your home directory') sys.exit sys.path.append( os.environ['GPGPUSIM_ROOT'] + '/aerialvision/' ) -import Tkinter as Tk +import tkinter as Tk import Pmw import startup import time -from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2TkAgg +from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk from matplotlib.figure import Figure startup.fileInput(sys.argv[1:]) + From e633760f87c1755412e0c02903947445ae7c9e77 Mon Sep 17 00:00:00 2001 From: Weili An Date: Wed, 10 May 2023 15:24:37 -0400 Subject: [PATCH 114/154] Fix typos --- src/gpgpu-sim/gpu-cache.cc | 10 +++++----- src/gpgpu-sim/mem_fetch.h | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 3a5a67dfa..8d129c649 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1375,7 +1375,7 @@ enum cache_request_status data_cache::wr_miss_wa_naive( // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf 
wb->set_chip(mf->get_tlx_addr().chip); - wb->set_parition(mf->get_tlx_addr().sub_partition); + wb->set_partition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), time, events); } @@ -1428,7 +1428,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); - wb->set_parition(mf->get_tlx_addr().sub_partition); + wb->set_partition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), time, events); } @@ -1501,7 +1501,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); - wb->set_parition(mf->get_tlx_addr().sub_partition); + wb->set_partition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), time, events); } @@ -1568,7 +1568,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); - wb->set_parition(mf->get_tlx_addr().sub_partition); + wb->set_partition(mf->get_tlx_addr().sub_partition); send_write_request(wb, cache_event(WRITE_BACK_REQUEST_SENT, evicted), time, events); } @@ -1650,7 +1650,7 @@ enum cache_request_status data_cache::rd_miss_base( // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); - wb->set_parition(mf->get_tlx_addr().sub_partition); + wb->set_partition(mf->get_tlx_addr().sub_partition); send_write_request(wb, 
WRITE_BACK_REQUEST_SENT, time, events); } return MISS; diff --git a/src/gpgpu-sim/mem_fetch.h b/src/gpgpu-sim/mem_fetch.h index e039846e3..283fe80e5 100644 --- a/src/gpgpu-sim/mem_fetch.h +++ b/src/gpgpu-sim/mem_fetch.h @@ -77,7 +77,7 @@ class mem_fetch { const addrdec_t &get_tlx_addr() const { return m_raw_addr; } void set_chip(unsigned chip_id) { m_raw_addr.chip = chip_id; } - void set_parition(unsigned sub_partition_id) { + void set_partition(unsigned sub_partition_id) { m_raw_addr.sub_partition = sub_partition_id; } unsigned get_data_size() const { return m_data_size; } From cb6060a60fd38102dd222f4f1a531c8f4c725d2b Mon Sep 17 00:00:00 2001 From: christindbose Date: Wed, 10 May 2023 15:29:48 -0400 Subject: [PATCH 115/154] Added check if reservation fail happens to prevent SEGF during cache probe --- src/gpgpu-sim/gpu-cache.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 3a5a67dfa..b3105ae0c 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -409,6 +409,11 @@ void tag_array::fill(new_addr_type addr, unsigned time, // assert( m_config.m_alloc_policy == ON_FILL ); unsigned idx; enum cache_request_status status = probe(addr, idx, mask, is_write); + + if (status == RESERVATION_FAIL) { + return; + } + bool before = m_lines[idx]->is_modified_line(); // assert(status==MISS||status==SECTOR_MISS); // MSHR should have prevented // redundant memory request From 5418798618ded0dbfcc0ca577c6b04e7d3a4dbc0 Mon Sep 17 00:00:00 2001 From: tgrogers Date: Fri, 19 May 2023 19:52:02 -0400 Subject: [PATCH 116/154] thowing an error flag on failure --- setup_environment | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/setup_environment b/setup_environment index 7eeaa4f12..871bb593a 100644 --- a/setup_environment +++ b/setup_environment @@ -16,17 +16,17 @@ echo -n "GPGPU-Sim version $GPGPUSIM_VERSION_STRING (build $GPGPUSIM_BUILD_STRIN if [ ! 
-n "$CUDA_INSTALL_PATH" ]; then echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH."; - return; + return 1; fi if [ ! -d "$CUDA_INSTALL_PATH" ]; then echo "ERROR ** CUDA_INSTALL_PATH=$CUDA_INSTALL_PATH invalid (directory does not exist)"; - return; + return 1; fi if [ ! `uname` = "Linux" -a ! `uname` = "Darwin" ]; then echo "ERROR ** Unsupported platform: GPGPU-Sim $GPGPUSIM_VERSION_STRING developed and tested on Linux." - return; + return 1; fi export PATH=`echo $PATH | sed "s#$GPGPUSIM_ROOT/bin:$CUDA_INSTALL_PATH/bin:##"` @@ -41,7 +41,7 @@ if [ $? = 1 ]; then echo " Try adding $CUDA_INSTALL_PATH/bin/ to your PATH environment variable."; echo " Please also be sure to read the README file if you have not done so."; echo ""; - return; + return 1; fi CC_VERSION=`gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($i,/^[0-9]\.[0-9]\.[0-9]$/)) {print $i; exit 0}}}'` @@ -51,7 +51,7 @@ export CUDA_VERSION_NUMBER=`echo $CUDA_VERSION_STRING | sed 's/\./ /' | awk '{pr if [ $CUDA_VERSION_NUMBER -gt 11100 -o $CUDA_VERSION_NUMBER -lt 2030 ]; then echo "ERROR ** GPGPU-Sim version $GPGPUSIM_VERSION_STRING not tested with CUDA version $CUDA_VERSION_STRING (please see README)"; echo $CUDA_VERSION_NUMBER - return + return 1; fi if [ $CUDA_VERSION_NUMBER -ge 6000 ]; then @@ -121,7 +121,7 @@ fi if [ -d $GPGPUSIM_ROOT/src/accelwattch/ ]; then if [ ! -f $GPGPUSIM_ROOT/src/accelwattch/gpgpu_sim.verify ]; then echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/accelwattch"; - return; + return 1; fi export GPGPUSIM_POWER_MODEL=$GPGPUSIM_ROOT/src/accelwattch/; echo "configured with AccelWattch."; @@ -129,13 +129,13 @@ elif [ -n "$GPGPUSIM_POWER_MODEL" ]; then if [ ! -f $GPGPUSIM_POWER_MODEL/gpgpu_sim.verify ]; then echo ""; echo "ERROR ** gpgpu_sim.verify not found in $GPGPUSIM_ROOT/src/accelwattch/ - Either incorrect directory or incorrect McPAT version"; - return; + return 1; fi echo "configure with power model in $GPGPUSIM_POWER_MODEL."; elif [ ! 
-d $GPGPUSIM_POWER_MODEL ]; then echo ""; echo "ERROR ** GPGPUSIM_POWER_MODEL ($GPGPUSIM_POWER_MODEL) does not exist... Please set this to the gpgpusim_mcpat directory or unset this environment variable."; - return; + return 1; else echo "configured without a power model."; fi From 8ca01b0721445de3b044e70327fd69f017a75ef0 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Wed, 7 Jun 2023 14:51:07 +0800 Subject: [PATCH 117/154] Revert QV100 config and add GV100 config --- .../SM7_GV100/accelwattch_ptx_sim.xml | 623 ++++++++++++++++++ .../SM7_GV100/accelwattch_ptx_sim_alt.xml | 623 ++++++++++++++++++ .../SM7_GV100/accelwattch_sass_hw.xml | 613 +++++++++++++++++ .../SM7_GV100/accelwattch_sass_hybrid.xml | 613 +++++++++++++++++ .../SM7_GV100/accelwattch_sass_sim.xml | 613 +++++++++++++++++ .../SM7_GV100/accelwattch_sass_sim_alt.xml | 613 +++++++++++++++++ .../SM7_GV100/config_volta_islip.icnt | 74 +++ configs/tested-cfgs/SM7_GV100/gpgpusim.config | 237 +++++++ configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- 9 files changed, 4010 insertions(+), 1 deletion(-) create mode 100644 configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim.xml create mode 100644 configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_GV100/accelwattch_sass_hw.xml create mode 100644 configs/tested-cfgs/SM7_GV100/accelwattch_sass_hybrid.xml create mode 100644 configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim.xml create mode 100644 configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim_alt.xml create mode 100644 configs/tested-cfgs/SM7_GV100/config_volta_islip.icnt create mode 100644 configs/tested-cfgs/SM7_GV100/gpgpusim.config diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim.xml new file mode 100644 index 000000000..d94d5bdd4 --- /dev/null +++ b/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim_alt.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim_alt.xml new file mode 100644 index 000000000..0c6f21147 --- /dev/null +++ b/configs/tested-cfgs/SM7_GV100/accelwattch_ptx_sim_alt.xml @@ -0,0 +1,623 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hw.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hw.xml new file mode 100644 index 000000000..64f89d646 --- /dev/null +++ b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hw.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hybrid.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hybrid.xml new file 
mode 100644 index 000000000..175f1fd47 --- /dev/null +++ b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_hybrid.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim.xml new file mode 100644 index 000000000..570332d1c --- /dev/null +++ b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim_alt.xml b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim_alt.xml new file mode 100644 index 000000000..9998e9656 --- /dev/null +++ b/configs/tested-cfgs/SM7_GV100/accelwattch_sass_sim_alt.xml @@ -0,0 +1,613 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file 
diff --git a/configs/tested-cfgs/SM7_GV100/config_volta_islip.icnt b/configs/tested-cfgs/SM7_GV100/config_volta_islip.icnt new file mode 100644 index 000000000..5ad7ecd48 --- /dev/null +++ b/configs/tested-cfgs/SM7_GV100/config_volta_islip.icnt @@ -0,0 +1,74 @@ +//21*1 fly with 32 flits per packet under gpgpusim injection mode +use_map = 0; +flit_size = 40; + +// currently we do not use this, see subnets below +network_count = 2; + +// Topology +topology = fly; +k = 144; +n = 1; + +// Routing + +routing_function = dest_tag; + + +// Flow control + +num_vcs = 1; +vc_buf_size = 256; +input_buffer_size = 256; +ejection_buffer_size = 256; +boundary_buffer_size = 256; + +wait_for_tail_credit = 0; + +// Router architecture + +vc_allocator = islip; //separable_input_first; +sw_allocator = islip; //separable_input_first; +alloc_iters = 1; + +credit_delay = 0; +routing_delay = 0; +vc_alloc_delay = 1; +sw_alloc_delay = 1; + +input_speedup = 1; +output_speedup = 1; +internal_speedup = 2.0; + +// Traffic, GPGPU-Sim does not use this + +traffic = uniform; +packet_size ={{1,2,3,4},{10,20}}; +packet_size_rate={{1,1,1,1},{2,1}}; + +// Simulation - Don't change + +sim_type = gpgpusim; +//sim_type = latency; +injection_rate = 0.1; + +subnets = 2; + +// Always use read and write no matter following line +//use_read_write = 1; + + +read_request_subnet = 0; +read_reply_subnet = 1; +write_request_subnet = 0; +write_reply_subnet = 1; + +read_request_begin_vc = 0; +read_request_end_vc = 0; +write_request_begin_vc = 0; +write_request_end_vc = 0; +read_reply_begin_vc = 0; +read_reply_end_vc = 0; +write_reply_begin_vc = 0; +write_reply_end_vc = 0; + diff --git a/configs/tested-cfgs/SM7_GV100/gpgpusim.config b/configs/tested-cfgs/SM7_GV100/gpgpusim.config new file mode 100644 index 000000000..8d2b10199 --- /dev/null +++ b/configs/tested-cfgs/SM7_GV100/gpgpusim.config @@ -0,0 +1,237 @@ +# Copyright (c) 2018-2021, Vijay Kandiah, Junrui Pan, Mahmoud Khairy, Scott Peverelle, Timothy Rogers, Tor M. 
Aamodt, Nikos Hardavellas +# Northwestern University, Purdue University, The University of British Columbia +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer; +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution; +# 3. Neither the names of Northwestern University, Purdue University, +# The University of British Columbia nor the names of their contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+ + +# This config models the Volta Quadro V100 +# For more info about volta architecture: +# http://images.nvidia.com/content/volta-architecture/pdf/volta-architecture-whitepaper.pdf +# https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=8344474&tag=1# +# http://on-demand.gputechconf.com/gtc/2018/presentation/s8122-dissecting-the-volta-gpu-architecture-through-microbenchmarking.pdf +# https://en.wikipedia.org/wiki/Volta_(microarchitecture) +# https://www.hotchips.org/wp-content/uploads/hc_archives/hc29/HC29.21-Monday-Pub/HC29.21.10-GPU-Gaming-Pub/HC29.21.132-Volta-Choquette-NVIDIA-Final3.pdf +# https://devblogs.nvidia.com/inside-volta/ +# http://on-demand.gputechconf.com/gtc/2017/presentation/s7798-luke-durant-inside-volta.pdf + +# functional simulator specification +-gpgpu_ptx_instruction_classification 0 +-gpgpu_ptx_sim_mode 0 +-gpgpu_ptx_force_max_capability 70 + +# Device Limits +-gpgpu_stack_size_limit 1024 +-gpgpu_heap_size_limit 8388608 +-gpgpu_runtime_sync_depth_limit 2 +-gpgpu_runtime_pending_launch_count_limit 2048 +-gpgpu_kernel_launch_latency 5000 +-gpgpu_TB_launch_latency 0 +-gpgpu_max_concurrent_kernel 128 + +# Compute Capability +-gpgpu_compute_capability_major 7 +-gpgpu_compute_capability_minor 0 + +# PTX execution-driven +-gpgpu_ptx_convert_to_ptxplus 0 +-gpgpu_ptx_save_converted_ptxplus 0 + +# high level architecture configuration +-gpgpu_n_clusters 80 +-gpgpu_n_cores_per_cluster 1 +-gpgpu_n_mem 32 +-gpgpu_n_sub_partition_per_mchannel 2 +-gpgpu_clock_gated_lanes 1 + +# volta clock domains +#-gpgpu_clock_domains ::: +-gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +# boost mode +# -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 + +# shader core pipeline config +-gpgpu_shader_registers 65536 +-gpgpu_registers_per_block 65536 +-gpgpu_occupancy_sm_number 70 + +# This implies a maximum of 64 warps/SM +-gpgpu_shader_core_pipeline 2048:32 +-gpgpu_shader_cta 32 +-gpgpu_simd_model 1 + +# Pipeline widths and number of FUs +# 
ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE +## Volta GV100 has 4 SP SIMD units, 4 SFU units, 4 DP units per core, 4 Tensor core units +## we need to scale the number of pipeline registers to be equal to the number of SP units +-gpgpu_pipeline_widths 4,4,4,4,4,4,4,4,4,4,8,4,4 +-gpgpu_num_sp_units 4 +-gpgpu_num_sfu_units 4 +-gpgpu_num_dp_units 4 +-gpgpu_num_int_units 4 +-gpgpu_tensor_core_avail 1 +-gpgpu_num_tensor_core_units 4 + +# Instruction latencies and initiation intervals +# "ADD,MAX,MUL,MAD,DIV" +# All Div operations are executed on SFU unit +-ptx_opcode_latency_int 4,13,4,5,145,21 +-ptx_opcode_initiation_int 2,2,2,2,8,4 +-ptx_opcode_latency_fp 4,13,4,5,39 +-ptx_opcode_initiation_fp 2,2,2,2,4 +-ptx_opcode_latency_dp 8,19,8,8,330 +-ptx_opcode_initiation_dp 4,4,4,4,130 +-ptx_opcode_latency_sfu 100 +-ptx_opcode_initiation_sfu 8 +-ptx_opcode_latency_tesnor 64 +-ptx_opcode_initiation_tensor 64 + +# Volta has sub core model, in which each scheduler has its own register file and EUs +# i.e. 
schedulers are isolated +-gpgpu_sub_core_model 1 +# disable specialized operand collectors and use generic operand collectors instead +-gpgpu_enable_specialized_operand_collector 0 +-gpgpu_operand_collector_num_units_gen 8 +-gpgpu_operand_collector_num_in_ports_gen 8 +-gpgpu_operand_collector_num_out_ports_gen 8 +# volta has 8 banks, 4 schedulers, two banks per scheduler +# we increase #banks to 16 to mitigate the effect of Regisrer File Cache (RFC) which we do not implement in the current version +-gpgpu_num_reg_banks 16 +-gpgpu_reg_file_port_throughput 2 + +# shared memory bankconflict detection +-gpgpu_shmem_num_banks 32 +-gpgpu_shmem_limited_broadcast 0 +-gpgpu_shmem_warp_parts 1 +-gpgpu_coalesce_arch 70 + +# Volta has four schedulers per core +-gpgpu_num_sched_per_core 4 +# Greedy then oldest scheduler +-gpgpu_scheduler lrr +## In Volta, a warp scheduler can issue 1 inst per cycle +-gpgpu_max_insn_issue_per_warp 1 +-gpgpu_dual_issue_diff_exec_units 1 + +## L1/shared memory configuration +# ::,::::,::,:** +# ** Optional parameter - Required when mshr_type==Texture Fifo +# Defualt config is 32KB DL1 and 96KB shared memory +# In Volta, we assign the remaining shared memory to L1 cache +# if the assigned shd mem = 0, then L1 cache = 128KB +# For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x +# disable this mode in case of multi kernels/apps execution +-gpgpu_adaptive_cache_config 1 +-gpgpu_shmem_option 0,8,16,32,64,96 +-gpgpu_unified_l1d_size 128 +# L1 cache configuration +-gpgpu_l1_banks 4 +-gpgpu_cache:dl1 S:4:128:64,L:T:m:L:L,A:512:8,16:0,32 +-gpgpu_l1_cache_write_ratio 25 +-gpgpu_l1_latency 20 +-gpgpu_gmem_skip_L1D 0 +-gpgpu_flush_l1_cache 1 +-gpgpu_n_cluster_ejection_buffer_size 32 +# shared memory configuration +-gpgpu_shmem_size 98304 +-gpgpu_shmem_sizeDefault 98304 +-gpgpu_shmem_per_block 65536 +-gpgpu_smem_latency 20 + +# 32 sets, each 128 bytes 24-way for each memory sub partition (96 KB per memory sub 
partition). This gives us 6MB L2 cache +-gpgpu_cache:dl2 S:32:128:24,L:B:m:L:P,A:192:4,32:0,32 +-gpgpu_cache:dl2_texture_only 0 +-gpgpu_dram_partition_queues 64:64:64:64 +-gpgpu_perf_sim_memcpy 1 +-gpgpu_memory_partition_indexing 2 + +# 128 KB Inst. +-gpgpu_cache:il1 N:64:128:16,L:R:f:N:L,S:2:48,4 +-gpgpu_inst_fetch_throughput 4 +# 128 KB Tex +# Note, TEX is deprecated in Volta, It is used for legacy apps only. Use L1D cache instead with .nc modifier or __ldg method +-gpgpu_tex_cache:l1 N:4:128:256,L:R:m:N:L,T:512:8,128:2 +# 64 KB Const +-gpgpu_const_cache:l1 N:128:64:8,L:R:f:N:L,S:2:64,4 +-gpgpu_perfect_inst_const_cache 1 + +# interconnection +#-network_mode 1 +#-inter_config_file config_volta_islip.icnt +# use built-in local xbar +-network_mode 2 +-icnt_in_buffer_limit 512 +-icnt_out_buffer_limit 512 +-icnt_subnets 2 +-icnt_flit_size 40 +-icnt_arbiter_algo 1 + +# memory partition latency config +-gpgpu_l2_rop_latency 160 +-dram_latency 100 + +# dram model config +-gpgpu_dram_scheduler 1 +-gpgpu_frfcfs_dram_sched_queue_size 64 +-gpgpu_dram_return_queue_size 192 + +# for HBM, three stacks, 24 channels, each (128 bits) 16 bytes width +-gpgpu_n_mem_per_ctrlr 1 +-gpgpu_dram_buswidth 16 +-gpgpu_dram_burst_length 2 +-dram_data_command_freq_ratio 2 # HBM is DDR +-gpgpu_mem_address_mask 1 +-gpgpu_mem_addr_mapping dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCB.CCCSSSSS + +# HBM timing are adopted from hynix JESD235 standard and nVidia HPCA 2017 paper (http://www.cs.utah.edu/~nil/pubs/hpca17.pdf) +# Timing for 1 GHZ +# tRRDl and tWTR are missing, need to be added +#-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=4:RCD=14:RAS=33:RP=14:RC=47: +# CL=14:WL=2:CDLR=3:WR=12:nbkgrp=4:CCDL=2:RTPL=4" + +# Timing for 850 MHZ, V100 HBM runs at 850 MHZ +-gpgpu_dram_timing_opt "nbk=16:CCD=1:RRD=3:RCD=12:RAS=28:RP=12:RC=40: + CL=12:WL=2:CDLR=3:WR=10:nbkgrp=4:CCDL=2:RTPL=3" + +# HBM has dual bus interface, in which it can issue two col and row commands at a time 
+-dram_dual_bus_interface 1 +# select lower bits for bnkgrp to increase bnkgrp parallelism +-dram_bnk_indexing_policy 0 +-dram_bnkgrp_indexing_policy 1 + +#-dram_seperate_write_queue_enable 1 +#-dram_write_queue_size 64:56:32 + +# stat collection +-gpgpu_memlatency_stat 14 +-gpgpu_runtime_stat 500 +-enable_ptx_file_line_stats 1 +-visualizer_enabled 0 + +# tracing functionality +#-trace_enabled 1 +#-trace_components WARP_SCHEDULER,SCOREBOARD +#-trace_sampling_core 0 \ No newline at end of file diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 8d2b10199..1b55aafe3 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -69,7 +69,7 @@ # volta clock domains #-gpgpu_clock_domains ::: --gpgpu_clock_domains 1447.0:1447.0:1447.0:850.0 +-gpgpu_clock_domains 1132.0:1132.0:1132.0:850.0 # boost mode # -gpgpu_clock_domains 1628.0:1628.0:1628.0:850.0 From ff35ae9bf8bbc05409011db58497a40e0794b2a2 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Thu, 8 Jun 2023 15:38:30 -0400 Subject: [PATCH 118/154] shared mem bank conflicts --- src/gpgpu-sim/shader.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 4013ae91e..ca26abbd1 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1880,6 +1880,7 @@ bool ldst_unit::shared_cycle(warp_inst_t &inst, mem_stage_stall_type &rc_fail, if (stall) { fail_type = S_MEM; rc_fail = BK_CONF; + m_stats->gpgpu_n_shmem_bkconflict++; } else rc_fail = NO_RC_FAIL; return !stall; @@ -1977,6 +1978,7 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache( inst.accessq_pop_back(); } else { result = BK_CONF; + m_stats->gpgpu_n_cache_bkconflict++; delete mf; break; // do not try again, just break from the loop and try the next // cycle From b471b3481b2399222ffd9ee0f007628834e68767 Mon Sep 17 00:00:00 2001 From: Ahmad Alawneh Date: Mon, 12 Jun 2023 
17:31:05 -0400 Subject: [PATCH 119/154] fixing bunch of formatting warnings (#53) * fixing bunch of formating warrnings * remove unintialized and unused results warnnings * revert the changes , as it doenst fix the warning --------- Co-authored-by: Fangjia Shen <50934207+FJShen@users.noreply.github.com> --- cuobjdump_to_ptxplus/cuobjdumpInstList.cc | 2 +- cuobjdump_to_ptxplus/cuobjdump_to_ptxplus.cc | 2 +- libcuda/cuda_runtime_api.cc | 10 +++++----- src/abstract_hardware_model.cc | 8 ++++---- src/abstract_hardware_model.h | 6 +++--- src/cuda-sim/cuda-sim.cc | 18 +++++++++++------- src/cuda-sim/cuda_device_runtime.cc | 4 ++-- src/cuda-sim/memory.cc | 8 ++++---- src/cuda-sim/ptx_ir.cc | 2 +- src/cuda-sim/ptx_loader.cc | 6 +++--- src/cuda-sim/ptx_parser.cc | 6 +++--- src/cuda-sim/ptx_sim.cc | 2 +- src/debug.cc | 8 ++++++-- src/gpgpu-sim/gpu-sim.cc | 2 +- src/gpgpu-sim/local_interconnect.cc | 14 +++++++------- src/gpgpu-sim/shader.cc | 8 ++++---- src/gpgpu-sim/stat-tool.cc | 2 +- src/intersim2/networks/kncube.cpp | 2 +- src/intersim2/networks/qtree.cpp | 2 +- 19 files changed, 60 insertions(+), 52 deletions(-) diff --git a/cuobjdump_to_ptxplus/cuobjdumpInstList.cc b/cuobjdump_to_ptxplus/cuobjdumpInstList.cc index 32834c745..d42e59e51 100644 --- a/cuobjdump_to_ptxplus/cuobjdumpInstList.cc +++ b/cuobjdump_to_ptxplus/cuobjdumpInstList.cc @@ -505,7 +505,7 @@ std::string cuobjdumpInstList::parseCuobjdumpRegister(std::string reg, bool lo, } else { output("ERROR: unknown register type.\n"); printf("\nERROR: unknown register type: "); - printf(reg.c_str()); + printf("%s",reg.c_str()); printf("\n"); assert(0); } diff --git a/cuobjdump_to_ptxplus/cuobjdump_to_ptxplus.cc b/cuobjdump_to_ptxplus/cuobjdump_to_ptxplus.cc index 82dcb7cad..5c6fdcd1b 100644 --- a/cuobjdump_to_ptxplus/cuobjdump_to_ptxplus.cc +++ b/cuobjdump_to_ptxplus/cuobjdump_to_ptxplus.cc @@ -54,7 +54,7 @@ FILE *ptxplus_out; void output(const char * text) { //printf(text); - fprintf(ptxplus_out, text); + 
fprintf(ptxplus_out,"%s", text); } void output(const std::string text) { diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc index fd05f555c..12d3aac7d 100644 --- a/libcuda/cuda_runtime_api.cc +++ b/libcuda/cuda_runtime_api.cc @@ -435,7 +435,7 @@ std::string get_app_binary() { // above func gives abs path whereas this give just the name of application. char *get_app_binary_name(std::string abs_path) { - char *self_exe_path; + char *self_exe_path = NULL; #ifdef __APPLE__ // TODO: get apple device and check the result. printf("WARNING: not tested for Apple-mac devices \n"); @@ -463,7 +463,7 @@ static int get_app_cuda_version() { "ldd " + get_app_binary() + " | grep libcudart.so | sed 's/.*libcudart.so.\\(.*\\) =>.*/\\1/' > " + fname; - system(app_cuda_version_command.c_str()); + int res = system(app_cuda_version_command.c_str()); FILE *cmd = fopen(fname, "r"); char buf[256]; while (fgets(buf, sizeof(buf), cmd) != 0) { @@ -1410,7 +1410,7 @@ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlagsInternal( function_info *entry = context->get_kernel(hostFunc); printf( "Calculate Maxium Active Block with function ptr=%p, blockSize=%d, " - "SMemSize=%d\n", + "SMemSize=%lu\n", hostFunc, blockSize, dynamicSMemSize); if (flags == cudaOccupancyDefault) { // create kernel_info based on entry @@ -3234,7 +3234,7 @@ char *readfile(const std::string filename) { fseek(fp, 0, SEEK_SET); // allocate and copy the entire ptx char *ret = (char *)malloc((filesize + 1) * sizeof(char)); - fread(ret, 1, filesize, fp); + int num = fread(ret, 1, filesize, fp); ret[filesize] = '\0'; fclose(fp); return ret; @@ -3478,7 +3478,7 @@ void gpgpu_context::cuobjdumpParseBinary(unsigned int handle) { context->add_binary(symtab, handle); return; } - symbol_table *symtab; + symbol_table *symtab = NULL; #if (CUDART_VERSION >= 6000) // loops through all ptx files from smallest sm version to largest diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index 
fda84e8b0..ed7347de7 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -75,7 +75,7 @@ void checkpoint::load_global_mem(class memory_space *temp_mem, char *f1name) { FILE *fp2 = fopen(f1name, "r"); assert(fp2 != NULL); char line[128]; /* or other suitable maximum line size */ - unsigned int offset; + unsigned int offset = 0; while (fgets(line, sizeof line, fp2) != NULL) /* read a line */ { unsigned int index; @@ -1006,13 +1006,13 @@ void simt_stack::print(FILE *fout) const { } for (unsigned j = 0; j < m_warp_size; j++) fprintf(fout, "%c", (stack_entry.m_active_mask.test(j) ? '1' : '0')); - fprintf(fout, " pc: 0x%03x", stack_entry.m_pc); + fprintf(fout, " pc: 0x%03llx", stack_entry.m_pc); if (stack_entry.m_recvg_pc == (unsigned)-1) { fprintf(fout, " rp: ---- tp: %s cd: %2u ", (stack_entry.m_type == STACK_ENTRY_TYPE_CALL ? "C" : "N"), stack_entry.m_calldepth); } else { - fprintf(fout, " rp: %4u tp: %s cd: %2u ", stack_entry.m_recvg_pc, + fprintf(fout, " rp: %4llu tp: %s cd: %2u ", stack_entry.m_recvg_pc, (stack_entry.m_type == STACK_ENTRY_TYPE_CALL ? "C" : "N"), stack_entry.m_calldepth); } @@ -1032,7 +1032,7 @@ void simt_stack::print_checkpoint(FILE *fout) const { for (unsigned j = 0; j < m_warp_size; j++) fprintf(fout, "%c ", (stack_entry.m_active_mask.test(j) ? 
'1' : '0')); - fprintf(fout, "%d %d %d %lld %d ", stack_entry.m_pc, + fprintf(fout, "%llu %d %llu %lld %d ", stack_entry.m_pc, stack_entry.m_calldepth, stack_entry.m_recvg_pc, stack_entry.m_branch_div_cycle, stack_entry.m_type); fprintf(fout, "%d %d\n", m_warp_id, m_warp_size); diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 6e4a87dac..3b95829b4 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -963,7 +963,7 @@ class inst_t { } bool valid() const { return m_decoded; } virtual void print_insn(FILE *fp) const { - fprintf(fp, " [inst @ pc=0x%04x] ", pc); + fprintf(fp, " [inst @ pc=0x%04llx] ", pc); } bool is_load() const { return (op == LOAD_OP || op == TENSOR_CORE_LOAD_OP || @@ -1157,7 +1157,7 @@ class warp_inst_t : public inst_t { // accessors virtual void print_insn(FILE *fp) const { - fprintf(fp, " [inst @ pc=0x%04x] ", pc); + fprintf(fp, " [inst @ pc=0x%04llx] ", pc); for (int i = (int)m_config->warp_size - 1; i >= 0; i--) fprintf(fp, "%c", ((m_warp_active_mask[i]) ? 
'1' : '0')); } @@ -1386,7 +1386,7 @@ class register_set { assert(has_ready()); warp_inst_t **ready; ready = NULL; - unsigned reg_id; + unsigned reg_id = 0; for (unsigned i = 0; i < regs.size(); i++) { if (not regs[i]->empty()) { if (ready and (*ready)->get_uid() < regs[i]->get_uid()) { diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index 680ce7970..b063512bf 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -545,7 +545,7 @@ void gpgpu_t::gpu_memset(size_t dst_start_addr, int c, size_t count) { void cuda_sim::ptx_print_insn(address_type pc, FILE *fp) { std::map::iterator f = g_pc_to_finfo.find(pc); if (f == g_pc_to_finfo.end()) { - fprintf(fp, "", pc); + fprintf(fp, "", pc); return; } function_info *finfo = f->second; @@ -559,7 +559,7 @@ std::string cuda_sim::ptx_get_insn_str(address_type pc) { #define STR_SIZE 255 char buff[STR_SIZE]; buff[STR_SIZE - 1] = '\0'; - snprintf(buff, STR_SIZE, "", pc); + snprintf(buff, STR_SIZE, "", pc); return std::string(buff); } function_info *finfo = f->second; @@ -1372,7 +1372,7 @@ void function_info::add_param_data(unsigned argn, unsigned num_bits = 8 * args->m_nbytes; printf( "GPGPU-Sim PTX: deferred allocation of shared region for \"%s\" from " - "0x%x to 0x%x (shared memory space)\n", + "0x%llx to 0x%llx (shared memory space)\n", p->name().c_str(), m_symtab->get_shared_next(), m_symtab->get_shared_next() + num_bits / 8); fflush(stdout); @@ -1503,7 +1503,7 @@ void function_info::list_param(FILE *fout) const { std::string name = p.get_name(); symbol *param = m_symtab->lookup(name.c_str()); addr_t param_addr = param->get_address(); - fprintf(fout, "%s: %#08x\n", name.c_str(), param_addr); + fprintf(fout, "%s: %#08llx\n", name.c_str(), param_addr); } fflush(fout); } @@ -1533,7 +1533,11 @@ void function_info::ptx_jit_config( filename_c.c_str()); assert(system(buff) != NULL); FILE *fp = fopen(filename_c.c_str(), "r"); - fgets(buff, 1024, fp); + char * ptr = fgets(buff, 1024, fp); + if(ptr == 
NULL ){ + printf("can't read file %s \n", filename_c.c_str()); + assert(0); + } fclose(fp); std::string fn(buff); size_t pos1, pos2; @@ -1877,7 +1881,7 @@ void ptx_thread_info::ptx_exec_inst(warp_inst_t &inst, unsigned lane_id) { dim3 tid = get_tid(); printf( "%u [thd=%u][i=%u] : ctaid=(%u,%u,%u) tid=(%u,%u,%u) icount=%u " - "[pc=%u] (%s:%u - %s) [0x%llx]\n", + "[pc=%llu] (%s:%u - %s) [0x%llx]\n", m_gpu->gpgpu_ctx->func_sim->g_ptx_sim_num_insn, get_uid(), pI->uid(), ctaid.x, ctaid.y, ctaid.z, tid.x, tid.y, tid.z, get_icount(), pc, pI->source_file(), pI->source_line(), pI->get_source(), @@ -2376,7 +2380,7 @@ void cuda_sim::read_sim_environment_variables() { "%s\n", dbg_pc); fflush(stdout); - sscanf(dbg_pc, "%d", &g_debug_pc); + sscanf(dbg_pc, "%llu", &g_debug_pc); } #if CUDART_VERSION > 1010 diff --git a/src/cuda-sim/cuda_device_runtime.cc b/src/cuda-sim/cuda_device_runtime.cc index 4a99c1cbb..8ed90bcc2 100644 --- a/src/cuda-sim/cuda_device_runtime.cc +++ b/src/cuda-sim/cuda_device_runtime.cc @@ -36,7 +36,7 @@ void cuda_device_runtime::gpgpusim_cuda_getParameterBufferV2( unsigned n_args = target_func->num_args(); assert(n_args == 4); - function_info *child_kernel_entry; + function_info *child_kernel_entry = NULL; struct dim3 grid_dim, block_dim; unsigned int shared_mem; @@ -258,7 +258,7 @@ void cuda_device_runtime::gpgpusim_cuda_streamCreateWithFlags( assert(n_args == 2); size_t generic_pStream_addr; - addr_t pStream_addr; + addr_t pStream_addr = 0; unsigned int flags; for (unsigned arg = 0; arg < n_args; arg++) { const operand_info &actual_param_op = diff --git a/src/cuda-sim/memory.cc b/src/cuda-sim/memory.cc index 132383780..036badaf1 100644 --- a/src/cuda-sim/memory.cc +++ b/src/cuda-sim/memory.cc @@ -109,11 +109,11 @@ void memory_space_impl::read_single_block(mem_addr_t blk_idx, if ((addr + length) > (blk_idx + 1) * BSIZE) { printf( "GPGPU-Sim PTX: ERROR * access to memory \'%s\' is unaligned : " - "addr=0x%x, length=%zu\n", + "addr=0x%llx, length=%zu\n", 
m_name.c_str(), addr, length); printf( - "GPGPU-Sim PTX: (addr+length)=0x%lx > 0x%x=(index+1)*BSIZE, " - "index=0x%x, BSIZE=0x%x\n", + "GPGPU-Sim PTX: (addr+length)=0x%llx > 0x%llx=(index+1)*BSIZE, " + "index=0x%llx, BSIZE=0x%x\n", (addr + length), (blk_idx + 1) * BSIZE, blk_idx, BSIZE); throw 1; } @@ -169,7 +169,7 @@ void memory_space_impl::print(const char *format, FILE *fout) const { typename map_t::const_iterator i_page; for (i_page = m_data.begin(); i_page != m_data.end(); ++i_page) { - fprintf(fout, "%s %08x:", m_name.c_str(), i_page->first); + fprintf(fout, "%s %08llx:", m_name.c_str(), i_page->first); i_page->second.print(format, fout); } } diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index 029cf73a8..f25f1d582 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -1470,7 +1470,7 @@ std::string ptx_instruction::to_string() const { unsigned used_bytes = 0; if (!is_label()) { used_bytes += - snprintf(buf + used_bytes, STR_SIZE - used_bytes, " PC=0x%03x ", m_PC); + snprintf(buf + used_bytes, STR_SIZE - used_bytes, " PC=0x%03llx ", m_PC); } else { used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes, " "); diff --git a/src/cuda-sim/ptx_loader.cc b/src/cuda-sim/ptx_loader.cc index 4e91763e8..df354983e 100644 --- a/src/cuda-sim/ptx_loader.cc +++ b/src/cuda-sim/ptx_loader.cc @@ -95,7 +95,7 @@ void gpgpu_context::print_ptx_file(const char *p, unsigned source_num, const ptx_instruction *pI = ptx_parser->ptx_instruction_lookup(filename, n); char pc[64]; if (pI && pI->get_PC()) - snprintf(pc, 64, "%4u", pI->get_PC()); + snprintf(pc, 64, "%4llu", pI->get_PC()); else snprintf(pc, 64, " "); printf(" _%u.ptx %4u (pc=%s): %s\n", source_num, n, pc, t); @@ -240,7 +240,7 @@ void fix_duplicate_errors(char fname2[1024]) { unsigned oldlinenum = 1; unsigned linenum; char *startptr = ptxdata; - char *funcptr; + char *funcptr = NULL; char *tempptr = ptxdata - 1; char *lineptr = ptxdata - 1; @@ -320,7 +320,7 @@ void fix_duplicate_errors(char 
fname2[1024]) { // we need the application name here too. char *get_app_binary_name() { char exe_path[1025]; - char *self_exe_path; + char *self_exe_path = NULL; #ifdef __APPLE__ // AMRUTH: get apple device and check the result. printf("WARNING: not tested for Apple-mac devices \n"); diff --git a/src/cuda-sim/ptx_parser.cc b/src/cuda-sim/ptx_parser.cc index 86a33c2d3..a80eeae64 100644 --- a/src/cuda-sim/ptx_parser.cc +++ b/src/cuda-sim/ptx_parser.cc @@ -206,7 +206,7 @@ void ptx_recognizer::end_function() { gpgpu_ptx_assemble(g_func_info->get_name(), g_func_info); g_current_symbol_table = g_global_symbol_table; - PTX_PARSE_DPRINTF("function %s, PC = %d\n", g_func_info->get_name().c_str(), + PTX_PARSE_DPRINTF("function %s, PC = %llu\n", g_func_info->get_name().c_str(), g_func_info->get_start_PC()); } @@ -486,7 +486,7 @@ void ptx_recognizer::add_identifier(const char *identifier, int array_dim, case param_space_local: printf( "GPGPU-Sim PTX: allocating stack frame region for .param \"%s\" from " - "0x%x to 0x%lx\n", + "0x%llx to 0x%llx\n", identifier, g_current_symbol_table->get_local_next(), g_current_symbol_table->get_local_next() + num_bits / 8); fflush(stdout); @@ -521,7 +521,7 @@ void ptx_recognizer::add_constptr(const char *identifier1, unsigned addr = s2->get_address(); - printf("GPGPU-Sim PTX: moving \"%s\" from 0x%x to 0x%x (%s+%x)\n", + printf("GPGPU-Sim PTX: moving \"%s\" from 0x%llx to 0x%x (%s+%d)\n", identifier1, s1->get_address(), addr + offset, identifier2, offset); s1->set_address(addr + offset); diff --git a/src/cuda-sim/ptx_sim.cc b/src/cuda-sim/ptx_sim.cc index dc801f8ca..6503499fc 100644 --- a/src/cuda-sim/ptx_sim.cc +++ b/src/cuda-sim/ptx_sim.cc @@ -369,7 +369,7 @@ static void print_reg(FILE *fp, std::string name, ptx_reg_t value, fprintf(fp, ".u64 %llu [0x%llx]\n", value.u64, value.u64); break; case F16_TYPE: - fprintf(fp, ".f16 %f [0x%04x]\n", value.f16, (unsigned)value.u16); + fprintf(fp, ".f16 %f [0x%04x]\n", static_cast(value.f16), 
(unsigned)value.u16); break; case F32_TYPE: fprintf(fp, ".f32 %.15lf [0x%08x]\n", value.f32, value.u32); diff --git a/src/debug.cc b/src/debug.cc index 29506bd75..e23ffd46d 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -124,7 +124,7 @@ void gpgpu_sim::gpgpu_debug() { fflush(stdout); char line[1024]; - fgets(line, 1024, stdin); + char * ptr = fgets(line, 1024, stdin); char *tok = strtok(line, " \t\n"); if (!strcmp(tok, "dp")) { @@ -136,7 +136,11 @@ void gpgpu_sim::gpgpu_debug() { fflush(stdout); } else if (!strcmp(tok, "q") || !strcmp(tok, "quit")) { printf("\nreally quit GPGPU-Sim (y/n)?\n"); - fgets(line, 1024, stdin); + ptr = fgets(line, 1024, stdin); + if(ptr == NULL ){ + printf("can't read input\n"); + exit(0); + } tok = strtok(line, " \t\n"); if (!strcmp(tok, "y")) { exit(0); diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 5af244b33..5a68f1355 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -2053,7 +2053,7 @@ void gpgpu_sim::cycle() { m_cluster[i]->get_current_occupancy(active, total); } DPRINTFG(LIVENESS, - "uArch: inst.: %lld (ipc=%4.1f, occ=%0.4f\% [%llu / %llu]) " + "uArch: inst.: %lld (ipc=%4.1f, occ=%0.4f%% [%llu / %llu]) " "sim_rate=%u (inst/sec) elapsed = %u:%u:%02u:%02u / %s", gpu_tot_sim_insn + gpu_sim_insn, (double)gpu_sim_insn / (double)gpu_sim_cycle, diff --git a/src/gpgpu-sim/local_interconnect.cc b/src/gpgpu-sim/local_interconnect.cc index 0e204623b..df6bd7bae 100644 --- a/src/gpgpu-sim/local_interconnect.cc +++ b/src/gpgpu-sim/local_interconnect.cc @@ -159,8 +159,8 @@ void xbar_router::RR_Advance() { } if (verbose) { - printf("%d : cycle %d : conflicts = %d\n", m_id, cycles, conflict_sub); - printf("%d : cycle %d : passing reqs = %d\n", m_id, cycles, reqs); + printf("%d : cycle %llu : conflicts = %d\n", m_id, cycles, conflict_sub); + printf("%d : cycle %llu : passing reqs = %d\n", m_id, cycles, reqs); } // collect some stats about buffer util @@ -217,7 +217,7 @@ void xbar_router::iSLIP_Advance() 
{ out_buffers[_packet.output_deviceID].push(_packet); in_buffers[node_id].pop(); if (verbose) - printf("%d : cycle %d : send req from %d to %d\n", m_id, cycles, + printf("%d : cycle %llu : send req from %d to %d\n", m_id, cycles, node_id, i - _n_shader); if (grant_cycles_count == 1) next_node[i] = (++node_id % total_nodes); @@ -228,7 +228,7 @@ void xbar_router::iSLIP_Advance() { Packet _packet2 = in_buffers[node_id2].front(); if (_packet2.output_deviceID == i) - printf("%d : cycle %d : cannot send req from %d to %d\n", + printf("%d : cycle %llu : cannot send req from %d to %d\n", m_id, cycles, node_id2, i - _n_shader); } } @@ -248,7 +248,7 @@ void xbar_router::iSLIP_Advance() { } if (verbose) - printf("%d : cycle %d : grant_cycles = %d\n", m_id, cycles, grant_cycles); + printf("%d : cycle %llu : grant_cycles = %d\n", m_id, cycles, grant_cycles); if (active && grant_cycles_count == 1) grant_cycles_count = grant_cycles; @@ -256,8 +256,8 @@ void xbar_router::iSLIP_Advance() { grant_cycles_count--; if (verbose) { - printf("%d : cycle %d : conflicts = %d\n", m_id, cycles, conflict_sub); - printf("%d : cycle %d : passing reqs = %d\n", m_id, cycles, reqs); + printf("%d : cycle %llu : conflicts = %d\n", m_id, cycles, conflict_sub); + printf("%d : cycle %llu : passing reqs = %d\n", m_id, cycles, reqs); } // collect some stats about buffer util diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 4013ae91e..4ae0f628f 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -3082,7 +3082,7 @@ void warp_inst_t::print(FILE *fout) const { fprintf(fout, "bubble\n"); return; } else - fprintf(fout, "0x%04x ", pc); + fprintf(fout, "0x%04llx ", pc); fprintf(fout, "w%02d[", m_warp_id); for (unsigned j = 0; j < m_config->warp_size; j++) fprintf(fout, "%c", (active(j) ? 
'1' : '0')); @@ -3268,7 +3268,7 @@ void shader_core_ctx::display_pipeline(FILE *fout, int print_mem, if (!m_inst_fetch_buffer.m_valid) fprintf(fout, "bubble\n"); else { - fprintf(fout, "w%2u : pc = 0x%x, nbytes = %u\n", + fprintf(fout, "w%2u : pc = 0x%llx, nbytes = %u\n", m_inst_fetch_buffer.m_warp_id, m_inst_fetch_buffer.m_pc, m_inst_fetch_buffer.m_nbytes); } @@ -3934,7 +3934,7 @@ bool shd_warp_t::waiting() { void shd_warp_t::print(FILE *fout) const { if (!done_exit()) { - fprintf(fout, "w%02u npc: 0x%04x, done:%c%c%c%c:%2u i:%u s:%u a:%u (done: ", + fprintf(fout, "w%02u npc: 0x%04llx, done:%c%c%c%c:%2u i:%u s:%u a:%u (done: ", m_warp_id, m_next_pc, (functional_done() ? 'f' : ' '), (stores_done() ? 's' : ' '), (inst_in_pipeline() ? ' ' : 'i'), (done_exit() ? 'e' : ' '), n_completed, m_inst_in_pipeline, @@ -4010,7 +4010,7 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { sub_core_model = shader->get_config()->sub_core_model; m_num_warp_scheds = shader->get_config()->gpgpu_num_sched_per_core; - unsigned reg_id; + unsigned reg_id = 0; if (sub_core_model) { assert(num_banks % shader->get_config()->gpgpu_num_sched_per_core == 0); assert(m_num_warp_scheds <= m_cu.size() && diff --git a/src/gpgpu-sim/stat-tool.cc b/src/gpgpu-sim/stat-tool.cc index 0513d17ed..08bbe9e02 100644 --- a/src/gpgpu-sim/stat-tool.cc +++ b/src/gpgpu-sim/stat-tool.cc @@ -519,7 +519,7 @@ void thread_insn_span::print_span(FILE *fout) const { fprintf(fout, "%d: ", (int)m_cycle); span_count_map::const_iterator i_sc = m_insn_span_count.begin(); for (; i_sc != m_insn_span_count.end(); ++i_sc) { - fprintf(fout, "%d ", i_sc->first); + fprintf(fout, "%llx ", i_sc->first); } fprintf(fout, "\n"); } diff --git a/src/intersim2/networks/kncube.cpp b/src/intersim2/networks/kncube.cpp index 03e13e713..178c90594 100644 --- a/src/intersim2/networks/kncube.cpp +++ b/src/intersim2/networks/kncube.cpp @@ -231,7 +231,7 @@ void KNCube::InsertRandomFaults( const Configuration &config ) int 
num_fails; unsigned long prev_seed; - int node, chan; + int node, chan = 0; int i, j, t, n, c; bool available; diff --git a/src/intersim2/networks/qtree.cpp b/src/intersim2/networks/qtree.cpp index 72149475e..37d3d7c7d 100644 --- a/src/intersim2/networks/qtree.cpp +++ b/src/intersim2/networks/qtree.cpp @@ -84,7 +84,7 @@ void QTree::_BuildNet( const Configuration& config ) { ostringstream routerName; - int h, r, pos, port; + int h, r = 0 , pos, port; for (h = 0; h < _n; h++) { for (pos = 0 ; pos < powi( _k, h ) ; ++pos ) { From ce5c443efa2ec72b7d7b88210eea71db89b29c89 Mon Sep 17 00:00:00 2001 From: Ahmad Alawneh Date: Tue, 6 Jun 2023 22:14:24 -0400 Subject: [PATCH 120/154] it is safe to to change m_updates to unsgined, the accessor returns unsigned anyway, it should fix warning in done() fun --- src/stream_manager.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/stream_manager.h b/src/stream_manager.h index afcbb0e41..561f54b87 100644 --- a/src/stream_manager.h +++ b/src/stream_manager.h @@ -73,7 +73,7 @@ struct CUevent_st { int m_uid; bool m_blocking; bool m_done; - int m_updates; + unsigned int m_updates; unsigned int m_issued; time_t m_wallclock; double m_gpu_tot_sim_cycle; From e700b1816492bb811e5aa12d1b1b0ec778e04235 Mon Sep 17 00:00:00 2001 From: Ahmad Alawneh Date: Tue, 6 Jun 2023 22:58:29 -0400 Subject: [PATCH 121/154] fix types, change int to unsigned int --- src/gpgpu-sim/gpu-sim.cc | 14 +++++++------- src/gpgpu-sim/shader.cc | 6 +++--- src/gpgpu-sim/shader.h | 14 +++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 5a68f1355..ea50fa02a 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -586,26 +586,26 @@ void shader_core_config::reg_options(class OptionParser *opp) { "ID_OC_SP,ID_OC_DP,ID_OC_INT,ID_OC_SFU,ID_OC_MEM,OC_EX_SP,OC_EX_DP,OC_EX_" "INT,OC_EX_SFU,OC_EX_MEM,EX_WB,ID_OC_TENSOR_CORE,OC_EX_TENSOR_CORE", 
"1,1,1,1,1,1,1,1,1,1,1,1,1"); - option_parser_register(opp, "-gpgpu_tensor_core_avail", OPT_INT32, + option_parser_register(opp, "-gpgpu_tensor_core_avail", OPT_UINT32, &gpgpu_tensor_core_avail, "Tensor Core Available (default=0)", "0"); - option_parser_register(opp, "-gpgpu_num_sp_units", OPT_INT32, + option_parser_register(opp, "-gpgpu_num_sp_units", OPT_UINT32, &gpgpu_num_sp_units, "Number of SP units (default=1)", "1"); - option_parser_register(opp, "-gpgpu_num_dp_units", OPT_INT32, + option_parser_register(opp, "-gpgpu_num_dp_units", OPT_UINT32, &gpgpu_num_dp_units, "Number of DP units (default=0)", "0"); - option_parser_register(opp, "-gpgpu_num_int_units", OPT_INT32, + option_parser_register(opp, "-gpgpu_num_int_units", OPT_UINT32, &gpgpu_num_int_units, "Number of INT units (default=0)", "0"); - option_parser_register(opp, "-gpgpu_num_sfu_units", OPT_INT32, + option_parser_register(opp, "-gpgpu_num_sfu_units", OPT_UINT32, &gpgpu_num_sfu_units, "Number of SF units (default=1)", "1"); - option_parser_register(opp, "-gpgpu_num_tensor_core_units", OPT_INT32, + option_parser_register(opp, "-gpgpu_num_tensor_core_units", OPT_UINT32, &gpgpu_num_tensor_core_units, "Number of tensor_core units (default=1)", "0"); option_parser_register( - opp, "-gpgpu_num_mem_units", OPT_INT32, &gpgpu_num_mem_units, + opp, "-gpgpu_num_mem_units", OPT_UINT32, &gpgpu_num_mem_units, "Number if ldst units (default=1) WARNING: not hooked up to anything", "1"); option_parser_register( diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 4ae0f628f..fdc7f779c 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -107,7 +107,7 @@ void shader_core_ctx::create_front_pipeline() { m_pipeline_reg.push_back( register_set(m_config->pipe_widths[j], pipeline_stage_name_decode[j])); } - for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + for (unsigned j = 0; j < m_config->m_specialized_unit.size(); j++) { m_pipeline_reg.push_back( 
register_set(m_config->m_specialized_unit[j].id_oc_spec_reg_width, m_config->m_specialized_unit[j].name)); @@ -115,7 +115,7 @@ void shader_core_ctx::create_front_pipeline() { m_specilized_dispatch_reg.push_back( &m_pipeline_reg[m_pipeline_reg.size() - 1]); } - for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + for (unsigned j = 0; j < m_config->m_specialized_unit.size(); j++) { m_pipeline_reg.push_back( register_set(m_config->m_specialized_unit[j].oc_ex_spec_reg_width, m_config->m_specialized_unit[j].name)); @@ -140,7 +140,7 @@ void shader_core_ctx::create_front_pipeline() { if (m_config->gpgpu_num_int_units > 0) assert(m_config->gpgpu_num_sched_per_core == m_pipeline_reg[ID_OC_INT].get_size()); - for (int j = 0; j < m_config->m_specialized_unit.size(); j++) { + for (unsigned j = 0; j < m_config->m_specialized_unit.size(); j++) { if (m_config->m_specialized_unit[j].num_units > 0) assert(m_config->gpgpu_num_sched_per_core == m_config->m_specialized_unit[j].id_oc_spec_reg_width); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index deea1c93a..c486d13c0 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1622,13 +1622,13 @@ class shader_core_config : public core_config { unsigned int gpgpu_operand_collector_num_out_ports_gen; unsigned int gpgpu_operand_collector_num_out_ports_int; - int gpgpu_num_sp_units; - int gpgpu_tensor_core_avail; - int gpgpu_num_dp_units; - int gpgpu_num_sfu_units; - int gpgpu_num_tensor_core_units; - int gpgpu_num_mem_units; - int gpgpu_num_int_units; + unsigned int gpgpu_num_sp_units; + unsigned int gpgpu_tensor_core_avail; + unsigned int gpgpu_num_dp_units; + unsigned int gpgpu_num_sfu_units; + unsigned int gpgpu_num_tensor_core_units; + unsigned int gpgpu_num_mem_units; + unsigned int gpgpu_num_int_units; // Shader core resources unsigned gpgpu_shader_registers; From ccf6662429efcfcf28d1050455163e41553a31f6 Mon Sep 17 00:00:00 2001 From: Ahmad Alawneh Date: Wed, 7 Jun 2023 00:05:06 -0400 
Subject: [PATCH 122/154] fix more Wsign warnings --- src/gpgpu-sim/addrdec.cc | 2 +- src/gpgpu-sim/power_interface.cc | 10 +++++----- src/gpgpu-sim/shader.cc | 10 +++++----- src/gpgpu-sim/shader.h | 4 ++-- src/intersim2/networks/anynet.cpp | 2 +- src/intersim2/vc.cpp | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/gpgpu-sim/addrdec.cc b/src/gpgpu-sim/addrdec.cc index 19714ec99..f4f83f9b1 100644 --- a/src/gpgpu-sim/addrdec.cc +++ b/src/gpgpu-sim/addrdec.cc @@ -519,7 +519,7 @@ void linear_to_raw_address_translation::sweep_test() const { h->second, raw_addr); abort(); } else { - assert((int)tlx.chip < m_n_channel); + assert(tlx.chip < m_n_channel); // ensure that partition_address() returns the concatenated address if ((ADDR_CHIP_S != -1 and raw_addr >= (1ULL << ADDR_CHIP_S)) or (ADDR_CHIP_S == -1 and raw_addr >= (1ULL << addrdec_mklow[CHIP]))) { diff --git a/src/gpgpu-sim/power_interface.cc b/src/gpgpu-sim/power_interface.cc index 470f2f9cf..45a09bcd9 100644 --- a/src/gpgpu-sim/power_interface.cc +++ b/src/gpgpu-sim/power_interface.cc @@ -269,7 +269,7 @@ void calculate_hw_mcpat(const gpgpu_sim_config &config, if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_WM])) l1_write_misses = power_stats->get_l1d_write_misses(1) - power_stats->l1w_misses_kernel; - if(aggregate_power_stats){ + if(aggregate_power_stats){ power_stats->tot_inst_execution += power_stats->get_total_inst(1); power_stats->tot_int_inst_execution += power_stats->get_total_int_inst(1); power_stats->tot_fp_inst_execution += power_stats->get_total_fp_inst(1); @@ -281,16 +281,16 @@ void calculate_hw_mcpat(const gpgpu_sim_config &config, l1_read_hits + l1_read_misses, l1_write_hits + l1_write_misses, power_stats->commited_inst_execution); - } - else{ - wrapper->set_inst_power( + } + else{ + wrapper->set_inst_power( shdr_config->gpgpu_clock_gated_lanes, cycle, //TODO: core.[0] cycles counts don't matter, remove this cycle, power_stats->get_total_inst(1), 
power_stats->get_total_int_inst(1), power_stats->get_total_fp_inst(1), l1_read_hits + l1_read_misses, l1_write_hits + l1_write_misses, power_stats->get_committed_inst(1)); - } + } // Single RF for both int and fp ops -- activity factor set to 0 for Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for register files wrapper->set_regfile_power(power_stats->get_regfile_reads(1), diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index fdc7f779c..f756aecf5 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1645,7 +1645,7 @@ void swl_scheduler::order_warps() { } void shader_core_ctx::read_operands() { - for (int i = 0; i < m_config->reg_file_port_throughput; ++i) + for (unsigned int i = 0; i < m_config->reg_file_port_throughput; ++i) m_operand_collector.step(); } @@ -1948,7 +1948,7 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache( if (inst.accessq_empty()) return result; if (m_config->m_L1D_config.l1_latency > 0) { - for (int j = 0; j < m_config->m_L1D_config.l1_banks; + for (unsigned int j = 0; j < m_config->m_L1D_config.l1_banks; j++) { // We can handle at max l1_banks reqs per cycle if (inst.accessq_empty()) return result; @@ -2001,7 +2001,7 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache( } void ldst_unit::L1_latency_queue_cycle() { - for (int j = 0; j < m_config->m_L1D_config.l1_banks; j++) { + for (unsigned int j = 0; j < m_config->m_L1D_config.l1_banks; j++) { if ((l1_latency_queue[j][0]) != NULL) { mem_fetch *mf_next = l1_latency_queue[j][0]; std::list events; @@ -2328,7 +2328,7 @@ sp_unit::sp_unit(register_set *result_port, const shader_core_config *config, specialized_unit::specialized_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core, unsigned supported_op, + shader_core_ctx *core, int supported_op, char *unit_name, unsigned latency, unsigned issue_reg_id) : pipelined_simd_unit(result_port, config, latency, core, issue_reg_id) { 
@@ -3501,7 +3501,7 @@ void shader_core_ctx::cycle() { execute(); read_operands(); issue(); - for (int i = 0; i < m_config->inst_fetch_throughput; ++i) { + for (unsigned int i = 0; i < m_config->inst_fetch_throughput; ++i) { decode(); fetch(); } diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index c486d13c0..fd4fc1ff6 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1284,7 +1284,7 @@ class sp_unit : public pipelined_simd_unit { class specialized_unit : public pipelined_simd_unit { public: specialized_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core, unsigned supported_op, + shader_core_ctx *core, int supported_op, char *unit_name, unsigned latency, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { if (inst.op != m_supported_op) { @@ -1297,7 +1297,7 @@ class specialized_unit : public pipelined_simd_unit { bool is_issue_partitioned() { return true; } private: - unsigned m_supported_op; + int m_supported_op; }; class simt_core_cluster; diff --git a/src/intersim2/networks/anynet.cpp b/src/intersim2/networks/anynet.cpp index 4db1dfbf5..d7c6f22b6 100644 --- a/src/intersim2/networks/anynet.cpp +++ b/src/intersim2/networks/anynet.cpp @@ -491,7 +491,7 @@ void AnyNet::readFile(){ } sort(node_check.begin(), node_check.end()); for(size_t i = 0; i= 0) { - if(f->pid != _expected_pid) { + if((long long int)f->pid != _expected_pid) { ostringstream err; err << "Received flit " << f->id << " with unexpected packet ID: " << f->pid << " (expected: " << _expected_pid << ")"; From 5c16d70a65e3e68803074fb7d8ba0324fc2355bf Mon Sep 17 00:00:00 2001 From: Ahmad Alawneh Date: Mon, 12 Jun 2023 21:29:12 -0400 Subject: [PATCH 123/154] remove unused vars --- src/intersim2/networks/dragonfly.cpp | 26 +++++++++++------------ src/intersim2/networks/flatfly_onchip.cpp | 4 ++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/intersim2/networks/dragonfly.cpp 
b/src/intersim2/networks/dragonfly.cpp index 01a2281d9..f5b637e85 100644 --- a/src/intersim2/networks/dragonfly.cpp +++ b/src/intersim2/networks/dragonfly.cpp @@ -111,7 +111,7 @@ int dragonfly_port(int rID, int source, int dest){ int dest_grp_ID = int(dest/_grp_num_nodes); int grp_output=-1; int grp_RID=-1; - int group_dest=-1; + // int group_dest=-1; //which router within this group the packet needs to go to if (dest_grp_ID == grp_ID) { @@ -123,7 +123,7 @@ int dragonfly_port(int rID, int source, int dest){ grp_output = dest_grp_ID - 1; } grp_RID = int(grp_output /gP) + grp_ID * _grp_num_routers; - group_dest = grp_RID * gP; + // group_dest = grp_RID * gP; } //At the last hop @@ -221,7 +221,7 @@ void DragonFlyNew::_BuildNet( const Configuration &config ) int _input=-1; int _dim_ID=-1; int _num_ports_per_switch=-1; - int _dim_size=-1; + // int _dim_size=-1; int c; ostringstream router_name; @@ -314,7 +314,7 @@ void DragonFlyNew::_BuildNet( const Configuration &config ) // intra-group GROUP channels for ( int dim = 0; dim < _n; ++dim ) { - _dim_size = powi(_k,dim); + // _dim_size = powi(_k,dim); _dim_ID = ((int) (node / ( powi(_p, dim)))); @@ -356,16 +356,16 @@ void DragonFlyNew::_BuildNet( const Configuration &config ) // add INPUT channels -- "optical" channels connecting the groups - int _grp_num_routers; + // int _grp_num_routers; int grp_output; - int grp_ID2; + // int grp_ID2; for ( int cnt = 0; cnt < _p; ++cnt ) { // _dim_ID grp_output = _dim_ID* _p + cnt; - _grp_num_routers = powi(_k, _n-1); - grp_ID2 = (int) ((grp_ID - 1) / (_k - 1)); + // _grp_num_routers = powi(_k, _n-1); + // grp_ID2 = (int) ((grp_ID - 1) / (_k - 1)); if ( grp_ID > grp_output) { @@ -495,8 +495,8 @@ void ugal_dragonflynew( const Router *r, const Flit *f, int in_channel, int debug = f->watch; int out_port = -1; int out_vc = 0; - int min_queue_size, min_hopcnt; - int nonmin_queue_size, nonmin_hopcnt; + int min_queue_size; //, min_hopcnt; + int nonmin_queue_size; //, nonmin_hopcnt; int 
intm_grp_ID; int intm_rID; @@ -523,13 +523,13 @@ void ugal_dragonflynew( const Router *r, const Flit *f, int in_channel, f->ph = 1; } else { //congestion metrics using queue length, obtained by GetUsedCredit() - min_hopcnt = dragonflynew_hopcnt(f->src, f->dest); + // min_hopcnt = dragonflynew_hopcnt(f->src, f->dest); min_router_output = dragonfly_port(rID, f->src, f->dest); min_queue_size = max(r->GetUsedCredit(min_router_output), 0) ; - nonmin_hopcnt = dragonflynew_hopcnt(f->src, f->intm) + - dragonflynew_hopcnt(f->intm,f->dest); + // nonmin_hopcnt = dragonflynew_hopcnt(f->src, f->intm) + + // dragonflynew_hopcnt(f->intm,f->dest); nonmin_router_output = dragonfly_port(rID, f->src, f->intm); nonmin_queue_size = max(r->GetUsedCredit(nonmin_router_output), 0); diff --git a/src/intersim2/networks/flatfly_onchip.cpp b/src/intersim2/networks/flatfly_onchip.cpp index fd17c1a41..df4337175 100644 --- a/src/intersim2/networks/flatfly_onchip.cpp +++ b/src/intersim2/networks/flatfly_onchip.cpp @@ -1204,7 +1204,7 @@ void ugal_pni_flatfly_onchip( const Router *r, const Flit *f, int in_channel, int find_distance (int src, int dest) { int dist = 0; int _dim = gN; - int _dim_size; + // int _dim_size; int src_tmp= (int) src / gC; int dest_tmp = (int) dest / gC; @@ -1212,7 +1212,7 @@ int find_distance (int src, int dest) { // cout << " HOP CNT between src: " << src << " dest: " << dest; for (int d=0;d < _dim; d++) { - _dim_size = powi(gK, d )*gC; + // _dim_size = powi(gK, d )*gC; //if ((int)(src / _dim_size) != (int)(dest / _dim_size)) // dist++; src_id = src_tmp % gK; From cb565024f23f1ab6b11d3717967b559aa49f3333 Mon Sep 17 00:00:00 2001 From: Ahmad Alawneh Date: Mon, 12 Jun 2023 22:00:22 -0400 Subject: [PATCH 124/154] more unused var fixes --- libcuda/cuda_runtime_api.cc | 9 +++++++++ src/gpgpu-sim/shader.cc | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc index 12d3aac7d..bc7ad3802 100644 --- 
a/libcuda/cuda_runtime_api.cc +++ b/libcuda/cuda_runtime_api.cc @@ -464,6 +464,10 @@ static int get_app_cuda_version() { " | grep libcudart.so | sed 's/.*libcudart.so.\\(.*\\) =>.*/\\1/' > " + fname; int res = system(app_cuda_version_command.c_str()); + if(res == -1){ + printf("Error - Cannot detect the app's CUDA version.\n"); + exit(1); + } FILE *cmd = fopen(fname, "r"); char buf[256]; while (fgets(buf, sizeof(buf), cmd) != 0) { @@ -3235,6 +3239,11 @@ char *readfile(const std::string filename) { // allocate and copy the entire ptx char *ret = (char *)malloc((filesize + 1) * sizeof(char)); int num = fread(ret, 1, filesize, fp); + if(num == 0){ + std::cout << "ERROR: Could not read data from file %s\n" + << filename << std::endl; + assert(0); + } ret[filesize] = '\0'; fclose(fp); return ret; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index f756aecf5..79be85747 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -481,7 +481,7 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu, m_config = config; m_memory_config = mem_config; m_stats = stats; - unsigned warp_size = config->warp_size; + // unsigned warp_size = config->warp_size; Issue_Prio = 0; m_sid = shader_id; From 40beac66a57f9477dd03369b79a26f9823f089ff Mon Sep 17 00:00:00 2001 From: Ahmad Alawneh Date: Mon, 12 Jun 2023 22:09:59 -0400 Subject: [PATCH 125/154] no return warnings ngs fix --- libcuda/cuda_runtime_api.cc | 1 + src/gpgpu-sim/dram.cc | 1 + src/gpgpu-sim/gpu-cache.h | 1 + 3 files changed, 3 insertions(+) diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc index bc7ad3802..5866b362e 100644 --- a/libcuda/cuda_runtime_api.cc +++ b/libcuda/cuda_runtime_api.cc @@ -3605,6 +3605,7 @@ unsigned CUDARTAPI __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim, announce_call(__my_func__); } cudaConfigureCallInternal(gridDim, blockDim, sharedMem, stream); + return 0; } cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim, diff 
--git a/src/gpgpu-sim/dram.cc b/src/gpgpu-sim/dram.cc index 662c2ed3f..53c823870 100644 --- a/src/gpgpu-sim/dram.cc +++ b/src/gpgpu-sim/dram.cc @@ -880,4 +880,5 @@ unsigned dram_t::get_bankgrp_number(unsigned i) { } else { assert(1); } + return 0; // we should never get here } diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 4bbf7e2b3..9a4856b9e 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -499,6 +499,7 @@ struct sector_cache_block : public cache_block_t { for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; ++i) { if (sector_mask.to_ulong() & (1 << i)) return i; } + return SECTOR_CHUNCK_SIZE; //error } }; From 24a35fbd683606efabae8d60a3283dc2bd2a66b0 Mon Sep 17 00:00:00 2001 From: Ahmad Alawneh Date: Mon, 12 Jun 2023 22:30:33 -0400 Subject: [PATCH 126/154] order warnings fix --- src/gpgpu-sim/shader.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index fd4fc1ff6..8c75cf0cc 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -351,8 +351,8 @@ class scheduler_unit { // this can be copied freely, so can be used in std m_sfu_out(sfu_out), m_int_out(int_out), m_tensor_core_out(tensor_core_out), - m_spec_cores_out(spec_cores_out), m_mem_out(mem_out), + m_spec_cores_out(spec_cores_out), m_id(id) {} virtual ~scheduler_unit() {} virtual void add_supervised_warp_id(int i) { From 9eaf173e6801a6bbb0f4acd13ea064fbd3054be8 Mon Sep 17 00:00:00 2001 From: Aaron M Barnes Date: Tue, 13 Jun 2023 12:59:52 -0400 Subject: [PATCH 127/154] rename cache bankconflict stat to be more descriptive --- src/gpgpu-sim/shader.cc | 8 ++++---- src/gpgpu-sim/shader.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index ca26abbd1..a9732d85b 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -642,7 +642,7 @@ void shader_core_stats::print(FILE *fout) const { fprintf(fout, 
"gpgpu_n_param_mem_insn = %d\n", gpgpu_n_param_insn); fprintf(fout, "gpgpu_n_shmem_bkconflict = %d\n", gpgpu_n_shmem_bkconflict); - fprintf(fout, "gpgpu_n_cache_bkconflict = %d\n", gpgpu_n_cache_bkconflict); + fprintf(fout, "gpgpu_n_l1cache_bkconflict = %d\n", gpgpu_n_l1cache_bkconflict); fprintf(fout, "gpgpu_n_intrawarp_mshr_merge = %d\n", gpgpu_n_intrawarp_mshr_merge); @@ -840,8 +840,8 @@ void shader_core_stats::visualizer_print(gzFile visualizer_file) { gzprintf(visualizer_file, "\n"); // overall cache miss rates - gzprintf(visualizer_file, "gpgpu_n_cache_bkconflict: %d\n", - gpgpu_n_cache_bkconflict); + gzprintf(visualizer_file, "gpgpu_n_l1cache_bkconflict: %d\n", + gpgpu_n_l1cache_bkconflict); gzprintf(visualizer_file, "gpgpu_n_shmem_bkconflict: %d\n", gpgpu_n_shmem_bkconflict); @@ -1978,7 +1978,7 @@ mem_stage_stall_type ldst_unit::process_memory_access_queue_l1cache( inst.accessq_pop_back(); } else { result = BK_CONF; - m_stats->gpgpu_n_cache_bkconflict++; + m_stats->gpgpu_n_l1cache_bkconflict++; delete mf; break; // do not try again, just break from the loop and try the next // cycle diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index deea1c93a..986105ed3 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1735,7 +1735,7 @@ struct shader_core_stats_pod { unsigned gpgpu_n_const_insn; unsigned gpgpu_n_param_insn; unsigned gpgpu_n_shmem_bkconflict; - unsigned gpgpu_n_cache_bkconflict; + unsigned gpgpu_n_l1cache_bkconflict; int gpgpu_n_intrawarp_mshr_merge; unsigned gpgpu_n_cmem_portconflict; unsigned gpu_stall_shd_mem_breakdown[N_MEM_STAGE_ACCESS_TYPE] From 301be9e59c6c934f4e194cf6c95dd0c60b3894cc Mon Sep 17 00:00:00 2001 From: Fangjia Shen <50934207+FJShen@users.noreply.github.com> Date: Sat, 17 Jun 2023 19:03:31 -0400 Subject: [PATCH 128/154] 137 drop sector cache flexibility (#57) Addresses accel-sim issue 137. 
For sector cache, the sector size must be 32B (hard-coded and not configurable) and cache line size must be set to 128B; a runtime parameter check will terminate simulation if the cache line size is not 128B. --- .../tested-cfgs/SM2_GTX480/gpgpusim.config | 2 +- .../SM3_KEPLER_TITAN/gpgpusim.config | 2 +- .../tested-cfgs/SM6_TITANX/gpgpusim.config | 2 +- .../tested-cfgs/SM75_RTX2060/gpgpusim.config | 2 +- .../SM75_RTX2060_S/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_GV100/gpgpusim.config | 2 +- configs/tested-cfgs/SM7_QV100/gpgpusim.config | 2 +- .../tested-cfgs/SM7_TITANV/gpgpusim.config | 2 +- .../tested-cfgs/SM86_RTX3070/gpgpusim.config | 2 +- src/gpgpu-sim/gpu-cache.h | 22 ++++++++++++++----- 10 files changed, 25 insertions(+), 15 deletions(-) diff --git a/configs/tested-cfgs/SM2_GTX480/gpgpusim.config b/configs/tested-cfgs/SM2_GTX480/gpgpusim.config index 609a9ef1b..bc01821db 100644 --- a/configs/tested-cfgs/SM2_GTX480/gpgpusim.config +++ b/configs/tested-cfgs/SM2_GTX480/gpgpusim.config @@ -56,7 +56,7 @@ # In Fermi, the cache and shared memory can be configured to 16kb:48kb(default) or 48kb:16kb -# ::,::::,::,:** +# :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo # Note: Hashing set index function (H) only applies to a set size of 32 or 64. -gpgpu_cache:dl1 N:32:128:4,L:L:m:N:H,S:64:8,8 diff --git a/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config b/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config index c83159f5f..ef47ddfd9 100644 --- a/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config +++ b/configs/tested-cfgs/SM3_KEPLER_TITAN/gpgpusim.config @@ -99,7 +99,7 @@ # Greedy then oldest scheduler -gpgpu_scheduler gto -# ::,::::,::,:** +# :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo # Note: Hashing set index function (H) only applies to a set size of 32 or 64. 
# The defulat is to disable the L1 cache, unless cache modifieres are used diff --git a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config index 652f0a09e..7d3e2d47e 100644 --- a/configs/tested-cfgs/SM6_TITANX/gpgpusim.config +++ b/configs/tested-cfgs/SM6_TITANX/gpgpusim.config @@ -123,7 +123,7 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# ::,::::,::,:** +# :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo # Note: Hashing set index function (H) only applies to a set size of 32 or 64. # The defulat is to disable the L1 cache, unless cache modifieres are used diff --git a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config index 158b97e17..6ff4b6c08 100644 --- a/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060/gpgpusim.config @@ -83,7 +83,7 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# ::,::::,::,:** +# :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo # In adaptive cache, we adaptively assign the remaining shared memory to L1 cache # For more info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x diff --git a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config index 89435a919..08ac75277 100644 --- a/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config +++ b/configs/tested-cfgs/SM75_RTX2060_S/gpgpusim.config @@ -128,7 +128,7 @@ -gpgpu_num_reg_banks 16 -gpgpu_reg_file_port_throughput 2 -# ::,::::,::,:** +# :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo -gpgpu_adaptive_cache_config 0 -gpgpu_l1_banks 4 diff --git a/configs/tested-cfgs/SM7_GV100/gpgpusim.config b/configs/tested-cfgs/SM7_GV100/gpgpusim.config index 8d2b10199..1595c6901 100644 --- a/configs/tested-cfgs/SM7_GV100/gpgpusim.config +++ 
b/configs/tested-cfgs/SM7_GV100/gpgpusim.config @@ -137,7 +137,7 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# ::,::::,::,:** +# :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo # Defualt config is 32KB DL1 and 96KB shared memory # In Volta, we assign the remaining shared memory to L1 cache diff --git a/configs/tested-cfgs/SM7_QV100/gpgpusim.config b/configs/tested-cfgs/SM7_QV100/gpgpusim.config index 1b55aafe3..b3384afcb 100644 --- a/configs/tested-cfgs/SM7_QV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_QV100/gpgpusim.config @@ -137,7 +137,7 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# ::,::::,::,:** +# :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo # Defualt config is 32KB DL1 and 96KB shared memory # In Volta, we assign the remaining shared memory to L1 cache diff --git a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config index af561de59..c37aaf053 100644 --- a/configs/tested-cfgs/SM7_TITANV/gpgpusim.config +++ b/configs/tested-cfgs/SM7_TITANV/gpgpusim.config @@ -107,7 +107,7 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# ::,::::,::,:** +# :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo # Defualt config is 32KB DL1 and 96KB shared memory # In Volta, we assign the remaining shared memory to L1 cache diff --git a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config index aee01308d..d26b1a621 100644 --- a/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config +++ b/configs/tested-cfgs/SM86_RTX3070/gpgpusim.config @@ -83,7 +83,7 @@ -gpgpu_dual_issue_diff_exec_units 1 ## L1/shared memory configuration -# ::,::::,::,:** +# :::,::::,::,:** # ** Optional parameter - Required when mshr_type==Texture Fifo # In adaptive cache, we adaptively assign the remaining shared memory to L1 cache # For more 
info, see https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 4bbf7e2b3..aa693b52b 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -563,10 +563,12 @@ class cache_config { char ct, rp, wp, ap, mshr_type, wap, sif; int ntok = - sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u", &ct, - &m_nset, &m_line_sz, &m_assoc, &rp, &wp, &ap, &wap, &sif, - &mshr_type, &m_mshr_entries, &m_mshr_max_merge, - &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width); + sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u", + &ct, &m_nset, &m_line_sz, &m_assoc, + &rp, &wp, &ap, &wap, &sif, + &mshr_type, &m_mshr_entries, &m_mshr_max_merge, + &m_miss_queue_size, &m_result_fifo_entries, + &m_data_port_width); if (ntok < 12) { if (!strcmp(config, "none")) { @@ -721,9 +723,17 @@ class cache_config { "Invalid cache configuration: FETCH_ON_WRITE and LAZY_FETCH_ON_READ " "cannot work properly with ON_FILL policy. Cache must be ON_MISS. "); } + if (m_cache_type == SECTOR) { - assert(m_line_sz / SECTOR_SIZE == SECTOR_CHUNCK_SIZE && - m_line_sz % SECTOR_SIZE == 0); + bool cond = + m_line_sz / SECTOR_SIZE == SECTOR_CHUNCK_SIZE && + m_line_sz % SECTOR_SIZE == 0; + if(!cond){ + std::cerr<<"error: For sector cache, the simulator uses hard-coded " + "SECTOR_SIZE and SECTOR_CHUNCK_SIZE. 
The line size " + "must be product of both values.\n"; + assert(0); + } } // default: port to data array width and granularity = line size From 68a91076b2aab8f60bae551d6df6b3a8aa411463 Mon Sep 17 00:00:00 2001 From: Ahmad Alawneh Date: Tue, 20 Jun 2023 14:42:00 -0400 Subject: [PATCH 129/154] fix most c warnings --- src/cuda-sim/cuda-sim.cc | 2 +- src/cuda-sim/instructions.cc | 2 +- src/cuda-sim/ptx_ir.h | 1 + src/gpgpu-sim/addrdec.cc | 2 +- src/gpgpu-sim/gpu-sim.cc | 2 +- src/gpgpu-sim/local_interconnect.cc | 4 ++-- src/gpgpu-sim/mem_fetch.cc | 8 ++++---- src/gpgpu-sim/shader_trace.h | 2 +- src/stream_manager.cc | 2 ++ 9 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index b063512bf..888cf7750 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -1531,7 +1531,7 @@ void function_info::ptx_jit_config( std::string filename_c(filename + "_c"); snprintf(buff, 1024, "c++filt %s > %s", get_name().c_str(), filename_c.c_str()); - assert(system(buff) != NULL); + assert(system(buff) != 0); FILE *fp = fopen(filename_c.c_str(), "r"); char * ptr = fgets(buff, 1024, fp); if(ptr == NULL ){ diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index e22d88a81..4981c9994 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -1948,7 +1948,7 @@ void mma_impl(const ptx_instruction *pI, core_t *core, warp_inst_t inst) { hex_val = (v[k / 2].s64 & 0xffff); else hex_val = ((v[k / 2].s64 & 0xffff0000) >> 16); - nw_v[k].f16 = *((half *)&hex_val); + nw_v[k].f16 = *(reinterpret_cast(hex_val)); } } if (!((operand_num == 3) && (type2 == F32_TYPE))) { diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index 825175964..7ba717118 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -1248,6 +1248,7 @@ class function_info { const ptx_version &get_ptx_version() const { return m_symtab->get_ptx_version(); } + virtual ~function_info(){} unsigned 
get_sm_target() const { return m_symtab->get_sm_target(); } bool is_extern() const { return m_extern; } void set_name(const char *name) { m_name = name; } diff --git a/src/gpgpu-sim/addrdec.cc b/src/gpgpu-sim/addrdec.cc index f4f83f9b1..db27c825b 100644 --- a/src/gpgpu-sim/addrdec.cc +++ b/src/gpgpu-sim/addrdec.cc @@ -584,7 +584,7 @@ unsigned next_powerOf2(unsigned n) { n = n - 1; // do till only one bit is left - while (n & n - 1) n = n & (n - 1); // unset rightmost bit + while (n & (n - 1)) n = n & (n - 1); // unset rightmost bit // n is now a power of two (less than n) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index ea50fa02a..47c0b4a89 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -80,7 +80,7 @@ class gpgpu_sim_wrapper {}; #include #include -#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +// #define MAX(a, b) (((a) > (b)) ? (a) : (b)) //redefined bool g_interactive_debugger_enabled = false; diff --git a/src/gpgpu-sim/local_interconnect.cc b/src/gpgpu-sim/local_interconnect.cc index df6bd7bae..fe7bc74fb 100644 --- a/src/gpgpu-sim/local_interconnect.cc +++ b/src/gpgpu-sim/local_interconnect.cc @@ -148,8 +148,8 @@ void xbar_router::RR_Advance() { } } } - - next_node_id = (++next_node_id % total_nodes); + next_node_id = next_node_id + 1 ; + next_node_id = (next_node_id % total_nodes); conflicts += conflict_sub; if (active) { diff --git a/src/gpgpu-sim/mem_fetch.cc b/src/gpgpu-sim/mem_fetch.cc index 456d891dd..0d86046ad 100644 --- a/src/gpgpu-sim/mem_fetch.cc +++ b/src/gpgpu-sim/mem_fetch.cc @@ -84,10 +84,10 @@ mem_fetch::~mem_fetch() { m_status = MEM_FETCH_DELETED; } #undef MF_TUP_END void mem_fetch::print(FILE *fp, bool print_inst) const { - if (this == NULL) { - fprintf(fp, " \n"); - return; - } + // if (this == NULL) { // doenst make sense! 
+ // fprintf(fp, " \n"); + // return; + // } fprintf(fp, " mf: uid=%6u, sid%02u:w%02u, part=%u, ", m_request_uid, m_sid, m_wid, m_raw_addr.chip); m_access.print(fp); diff --git a/src/gpgpu-sim/shader_trace.h b/src/gpgpu-sim/shader_trace.h index e7486d8b0..367262c90 100644 --- a/src/gpgpu-sim/shader_trace.h +++ b/src/gpgpu-sim/shader_trace.h @@ -38,7 +38,7 @@ #define SCHED_PRINT_STR SHADER_PRINT_STR "Scheduler %d - " #define SHADER_DTRACE(x) \ (DTRACE(x) && \ - (Trace::sampling_core == get_sid() || Trace::sampling_core == -1)) + (Trace::sampling_core == (int)get_sid() || Trace::sampling_core == -1)) // Intended to be called from inside components of a shader core. // Depends on a get_sid() function diff --git a/src/stream_manager.cc b/src/stream_manager.cc index e99bf8783..0ce3c6a74 100644 --- a/src/stream_manager.cc +++ b/src/stream_manager.cc @@ -227,6 +227,8 @@ void stream_operation::print(FILE *fp) const { case stream_no_op: fprintf(fp, "no-op"); break; + default: + break; } } From 5831da33a0421de19a34b628813be86bc73b9f91 Mon Sep 17 00:00:00 2001 From: Ahmad Alawneh Date: Tue, 20 Jun 2023 15:07:37 -0400 Subject: [PATCH 130/154] ignore lex warnings --- cuobjdump_to_ptxplus/Makefile | 4 ++-- src/cuda-sim/Makefile | 8 ++++---- src/intersim2/Makefile | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cuobjdump_to_ptxplus/Makefile b/cuobjdump_to_ptxplus/Makefile index e95136a95..0eb7d1e1d 100644 --- a/cuobjdump_to_ptxplus/Makefile +++ b/cuobjdump_to_ptxplus/Makefile @@ -28,10 +28,10 @@ $(OUTPUT_DIR)/cuobjdump_to_ptxplus: $(OUTPUT_DIR)/cuobjdumpInst.o $(OUTPUT_DIR)/ $(OUTPUT_DIR)/lex.ptx_.c : ptx.l - ${LEX} ${LEXFLAGS} -o$(OUTPUT_DIR)/lex.ptx_.c ptx.l + ${LEX} ${LEXFLAGS} -o$(OUTPUT_DIR)/lex.ptx_.c ptx.l 2> /dev/null $(OUTPUT_DIR)/ptx.tab.c : ptx.y - ${YACC} ${YFLAGS} --name-prefix=ptx_ -v ptx.y --file-prefix=$(OUTPUT_DIR)/ptx + ${YACC} ${YFLAGS} --name-prefix=ptx_ -v ptx.y --file-prefix=$(OUTPUT_DIR)/ptx 2> /dev/null $(OUTPUT_DIR)/ptx.tab.h 
:$(OUTPUT_DIR)/ptx.tab.c diff --git a/src/cuda-sim/Makefile b/src/cuda-sim/Makefile index 01bc4807f..541cf8f26 100644 --- a/src/cuda-sim/Makefile +++ b/src/cuda-sim/Makefile @@ -91,16 +91,16 @@ $(OUTPUT_DIR)/lex.ptxinfo_.o: $(OUTPUT_DIR)/lex.ptxinfo_.c $(OUTPUT_DIR)/ptxinfo $(CPP) -c $(CXX_OPT) $(OUTPUT_DIR)/lex.ptxinfo_.c -o $(OUTPUT_DIR)/lex.ptxinfo_.o $(OUTPUT_DIR)/ptx.tab.c: ptx.y - bison --name-prefix=ptx_ -v -d ptx.y --file-prefix=$(OUTPUT_DIR)/ptx + bison --name-prefix=ptx_ -v -d ptx.y --file-prefix=$(OUTPUT_DIR)/ptx 2> /dev/null $(OUTPUT_DIR)/ptxinfo.tab.c: ptxinfo.y - bison --name-prefix=ptxinfo_ -v -d ptxinfo.y --file-prefix=$(OUTPUT_DIR)/ptxinfo + bison --name-prefix=ptxinfo_ -v -d ptxinfo.y --file-prefix=$(OUTPUT_DIR)/ptxinfo 2> /dev/null $(OUTPUT_DIR)/lex.ptx_.c: ptx.l - flex --outfile=$(OUTPUT_DIR)/lex.ptx_.c ptx.l + flex --outfile=$(OUTPUT_DIR)/lex.ptx_.c ptx.l 2> /dev/null $(OUTPUT_DIR)/lex.ptxinfo_.c: ptxinfo.l - flex --outfile=$(OUTPUT_DIR)/lex.ptxinfo_.c ptxinfo.l + flex --outfile=$(OUTPUT_DIR)/lex.ptxinfo_.c ptxinfo.l 2> /dev/null clean: rm -f *~ *.o *.gcda *.gcno *.gcov libgpgpu_ptx_sim.a \ diff --git a/src/intersim2/Makefile b/src/intersim2/Makefile index 3eeeb7041..dad436aa6 100644 --- a/src/intersim2/Makefile +++ b/src/intersim2/Makefile @@ -136,10 +136,10 @@ depend: makedepend -f$(OBJDIR)/Makefile.makedepend -I$(INCPATH) -p$(OBJDIR)/ $(ALL_SRCS) 2> /dev/null ${LEX_OBJS}: $(OBJDIR)/lex.yy.c $(OBJDIR)/y.tab.h - $(CC) $(CPPFLAGS) -c $< -o $@ + $(CC) -Wno-unused-function $(CPPFLAGS) -c $< -o $@ ${YACC_OBJS}: $(OBJDIR)/y.tab.c $(OBJDIR)/y.tab.h - $(CC) $(CPPFLAGS) -c $< -o $@ + $(CC) -Wno-unused-function $(CPPFLAGS) -c $< -o $@ ${OBJDIR}/%.o: %.cpp $(CXX) $(CPPFLAGS) -c $< -o $@ From a0c12f5d63504c67c8bdfb1a6cc689b4ab7867a6 Mon Sep 17 00:00:00 2001 From: Connie120 Date: Wed, 20 Sep 2023 14:53:37 -0400 Subject: [PATCH 131/154] LDGSTS, LDGDEPBAR and DEPBAR Implementations (#62) --- src/abstract_hardware_model.h | 21 ++++++ src/gpgpu-sim/shader.cc 
| 129 +++++++++++++++++++++++++++++++++- src/gpgpu-sim/shader.h | 48 +++++++++++++ 3 files changed, 196 insertions(+), 2 deletions(-) diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index 3b95829b4..ebf6535ea 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1056,6 +1056,13 @@ class warp_inst_t : public inst_t { m_uid = 0; m_empty = true; m_config = NULL; + + // Ni: + m_is_ldgsts = false; + m_is_ldgdepbar = false; + m_is_depbar = false; + + m_depbar_group_no = 0; } warp_inst_t(const core_config *config) { m_uid = 0; @@ -1069,6 +1076,13 @@ class warp_inst_t : public inst_t { m_is_printf = false; m_is_cdp = 0; should_do_atomic = true; + + // Ni: + m_is_ldgsts = false; + m_is_ldgdepbar = false; + m_is_depbar = false; + + m_depbar_group_no = 0; } virtual ~warp_inst_t() {} @@ -1251,6 +1265,13 @@ class warp_inst_t : public inst_t { // Jin: cdp support public: int m_is_cdp; + + // Ni: add boolean to indicate whether the instruction is ldgsts + bool m_is_ldgsts; + bool m_is_ldgdepbar; + bool m_is_depbar; + + unsigned int m_depbar_group_no; }; void move_warp(warp_inst_t *&dst, warp_inst_t *&src); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 229b305c1..67540e083 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -532,7 +532,6 @@ void shader_core_ctx::reinit(unsigned start_thread, unsigned end_thread, void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, unsigned end_thread, unsigned ctaid, int cta_size, kernel_info_t &kernel) { - // address_type start_pc = next_pc(start_thread); unsigned kernel_id = kernel.get_uid(); if (m_config->model == POST_DOMINATOR) { @@ -1046,6 +1045,25 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set, m_stats->shader_cycle_distro[2 + (*pipe_reg)->active_count()]++; func_exec_inst(**pipe_reg); + // Add LDGSTS instructions into a buffer + unsigned int ldgdepbar_id = m_warp[warp_id]->m_ldgdepbar_id; + if 
(next_inst->m_is_ldgsts) { + if (m_warp[warp_id]->m_ldgdepbar_buf.size() == ldgdepbar_id + 1) { + m_warp[warp_id]->m_ldgdepbar_buf[ldgdepbar_id].push_back(*next_inst); + } + else { + assert(m_warp[warp_id]->m_ldgdepbar_buf.size() < ldgdepbar_id + 1); + std::vector l; + l.push_back(*next_inst); + m_warp[warp_id]->m_ldgdepbar_buf.push_back(l); + } + // If the mask of the instruction is all 0, then the address is also 0, + // so that there's no need to check through the writeback + if (next_inst->get_active_mask() == 0) { + (m_warp[warp_id]->m_ldgdepbar_buf.back()).back().pc = -1; + } + } + if (next_inst->op == BARRIER_OP) { m_warp[warp_id]->store_info_of_last_inst_at_barrier(*pipe_reg); m_barriers.warp_reaches_barrier(m_warp[warp_id]->get_cta_id(), warp_id, @@ -1053,6 +1071,37 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set, } else if (next_inst->op == MEMORY_BARRIER_OP) { m_warp[warp_id]->set_membar(); + } else if (next_inst->m_is_ldgdepbar) { // Add for LDGDEPBAR + m_warp[warp_id]->m_ldgdepbar_id++; + } else if (next_inst->m_is_depbar) { // Add for DEPBAR + // Set to true immediately when a DEPBAR instruction is met + m_warp[warp_id]->m_waiting_ldgsts = true; + m_warp[warp_id]->m_depbar_group = next_inst->m_depbar_group_no; // set in trace_driven.cc + + // Record the last group that's possbily being monitored by this DEPBAR instr + m_warp[warp_id]->m_depbar_start_id = m_warp[warp_id]->m_ldgdepbar_id - 1; + + // Record the last group that's actually being monitored by this DEPBAR instr + unsigned int end_group = m_warp[warp_id]->m_ldgdepbar_id - m_warp[warp_id]->m_depbar_group; + + // Check for the case that the LDGSTSs monitored have finished when encountering the + // DEPBAR instruction + bool done_flag = true; + for (int i = 0; i < end_group; i++) { + for (int j = 0; j < m_warp[warp_id]->m_ldgdepbar_buf[i].size(); j++) { + if (m_warp[warp_id]->m_ldgdepbar_buf[i][j].pc != -1) { + done_flag = false; + goto UpdateDEPBAR; + } + } + } + + UpdateDEPBAR: 
+ if (done_flag) { + if (m_warp[warp_id]->m_waiting_ldgsts) { + m_warp[warp_id]->m_waiting_ldgsts = false; + } + } } updateSIMTStack(warp_id, *pipe_reg); @@ -1796,12 +1845,50 @@ void ldst_unit::get_L1T_sub_stats(struct cache_sub_stats &css) const { if (m_L1T) m_L1T->get_sub_stats(css); } +// Add this function to unset depbar +void shader_core_ctx::unset_depbar(const warp_inst_t &inst) { + bool done_flag = true; + unsigned int end_group = m_warp[inst.warp_id()]->m_depbar_start_id == 0 ? + m_warp[inst.warp_id()]->m_ldgdepbar_buf.size() : + (m_warp[inst.warp_id()]->m_depbar_start_id - m_warp[inst.warp_id()]->m_depbar_group + 1); + + if (inst.m_is_ldgsts) { + for (int i = 0; i < m_warp[inst.warp_id()]->m_ldgdepbar_buf.size(); i++) { + for (int j = 0; j < m_warp[inst.warp_id()]->m_ldgdepbar_buf[i].size(); j++) { + if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc == inst.pc) { + // Handle the case that same pc results in multiple LDGSTS instructions + if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].get_addr(0) == inst.get_addr(0)) { + m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc = -1; + goto DoneWB; + } + } + } + } + + DoneWB: + for (int i = 0; i < end_group; i++) { + for (int j = 0; j < m_warp[inst.warp_id()]->m_ldgdepbar_buf[i].size(); j++) { + if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc != -1) { + done_flag = false; + goto UpdateDEPBAR; + } + } + } + + UpdateDEPBAR: + if (done_flag) { + if (m_warp[inst.warp_id()]->m_waiting_ldgsts) { + m_warp[inst.warp_id()]->m_waiting_ldgsts = false; + } + } + } +} + void shader_core_ctx::warp_inst_complete(const warp_inst_t &inst) { #if 0 printf("[warp_inst_complete] uid=%u core=%u warp=%u pc=%#x @ time=%llu \n", inst.get_uid(), m_sid, inst.warp_id(), inst.pc, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); #endif - if (inst.op_pipe == SP__OP) m_stats->m_num_sp_committed[m_sid]++; else if (inst.op_pipe == SFU__OP) @@ -1907,6 +1994,14 @@ mem_stage_stall_type ldst_unit::process_cache_access( if (inst.is_load()) { 
for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++) if (inst.out[r] > 0) m_pending_writes[inst.warp_id()][inst.out[r]]--; + + // release LDGSTS + if (inst.m_is_ldgsts) { + m_pending_ldgsts[inst.warp_id()][inst.pc][inst.get_addr(0)]--; + if (m_pending_ldgsts[inst.warp_id()][inst.pc][inst.get_addr(0)] == 0) { + m_core->unset_depbar(inst); + } + } } if (!write_sent) delete mf; } else if (status == RESERVATION_FAIL) { @@ -2035,6 +2130,14 @@ void ldst_unit::L1_latency_queue_cycle() { m_core->warp_inst_complete(mf_next->get_inst()); } } + + // release LDGSTS + if (mf_next->get_inst().m_is_ldgsts) { + m_pending_ldgsts[mf_next->get_inst().warp_id()][mf_next->get_inst().pc][mf_next->get_inst().get_addr(0)]--; + if (m_pending_ldgsts[mf_next->get_inst().warp_id()][mf_next->get_inst().pc][mf_next->get_inst().get_addr(0)] == 0) { + m_core->unset_depbar(mf_next->get_inst()); + } + } } // For write hit in WB policy @@ -2571,10 +2674,21 @@ void ldst_unit::writeback() { insn_completed = true; } } + else if (m_next_wb.m_is_ldgsts) { // for LDGSTS instructions where no output register is used + m_pending_ldgsts[m_next_wb.warp_id()][m_next_wb.pc][m_next_wb.get_addr(0)]--; + if (m_pending_ldgsts[m_next_wb.warp_id()][m_next_wb.pc][m_next_wb.get_addr(0)] == 0) { + insn_completed = true; + } + break; + } } if (insn_completed) { m_core->warp_inst_complete(m_next_wb); + if (m_next_wb.m_is_ldgsts) { + m_core->unset_depbar(m_next_wb); + } } + m_next_wb.clear(); m_last_inst_gpu_sim_cycle = m_core->get_gpu()->gpu_sim_cycle; m_last_inst_gpu_tot_sim_cycle = m_core->get_gpu()->gpu_tot_sim_cycle; @@ -2796,6 +2910,14 @@ void ldst_unit::cycle() { if (!pending_requests) { m_core->warp_inst_complete(*m_dispatch_reg); m_scoreboard->releaseRegisters(m_dispatch_reg); + + // release LDGSTS + if (m_dispatch_reg->m_is_ldgsts) { + // m_pending_ldgsts[m_dispatch_reg->warp_id()][m_dispatch_reg->pc][m_dispatch_reg->get_addr(0)]--; + if 
(m_pending_ldgsts[m_dispatch_reg->warp_id()][m_dispatch_reg->pc][m_dispatch_reg->get_addr(0)] == 0) { + m_core->unset_depbar(*m_dispatch_reg); + } + } } m_core->dec_inst_in_pipeline(warp_id); m_dispatch_reg->clear(); @@ -3930,6 +4052,8 @@ bool shd_warp_t::waiting() { // the functional execution of the atomic when it hits DRAM can cause // the wrong register to be read. return true; + } else if (m_waiting_ldgsts) { // Waiting for LDGSTS to finish + return true; } return false; } @@ -4050,6 +4174,7 @@ int register_bank(int regnum, int wid, unsigned num_banks, bool opndcoll_rfu_t::writeback(warp_inst_t &inst) { assert(!inst.empty()); + std::list regs = m_shader->get_regs_written(inst); for (unsigned op = 0; op < MAX_REG_OPERANDS; op++) { int reg_num = inst.arch_reg.dst[op]; // this math needs to match that used diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 381e2c962..089730267 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -123,6 +123,20 @@ class shd_warp_t { // Jin: cdp support m_cdp_latency = 0; m_cdp_dummy = false; + + // Ni: Initialize ldgdepbar_id + m_ldgdepbar_id = 0; + m_depbar_start_id = 0; + m_depbar_group = 0; + + // Ni: Set waiting to false + m_waiting_ldgsts = false; + + // Ni: Clear m_ldgdepbar_buf + for (int i = 0; i < m_ldgdepbar_buf.size(); i++) { + m_ldgdepbar_buf[i].clear(); + } + m_ldgdepbar_buf.clear(); } void init(address_type start_pc, unsigned cta_id, unsigned wid, const std::bitset &active, @@ -140,6 +154,20 @@ class shd_warp_t { // Jin: cdp support m_cdp_latency = 0; m_cdp_dummy = false; + + // Ni: Initialize ldgdepbar_id + m_ldgdepbar_id = 0; + m_depbar_start_id = 0; + m_depbar_group = 0; + + // Ni: Set waiting to false + m_waiting_ldgsts = false; + + // Ni: Clear m_ldgdepbar_buf + for (int i = 0; i < m_ldgdepbar_buf.size(); i++) { + m_ldgdepbar_buf[i].clear(); + } + m_ldgdepbar_buf.clear(); } bool functional_done() const; @@ -288,6 +316,14 @@ class shd_warp_t { public: unsigned int m_cdp_latency; 
bool m_cdp_dummy; + + // Ni: LDGDEPBAR barrier support + public: + unsigned int m_ldgdepbar_id; // LDGDEPBAR barrier ID + std::vector> m_ldgdepbar_buf; // LDGDEPBAR barrier buffer + unsigned int m_depbar_start_id; + unsigned int m_depbar_group; + bool m_waiting_ldgsts; // Ni: Whether the warp is waiting for the LDGSTS instrs to finish }; inline unsigned hw_tid_from_wid(unsigned wid, unsigned warp_size, unsigned i) { @@ -1314,6 +1350,15 @@ class ldst_unit : public pipelined_simd_unit { const memory_config *mem_config, class shader_core_stats *stats, unsigned sid, unsigned tpc); + // Add a structure to record the LDGSTS instructions, + // similar to m_pending_writes, but since LDGSTS does not have a output register + // to write to, so a new structure needs to be added + /* A multi-level map: unsigned (warp_id) -> unsigned (pc) -> unsigned (addr) -> unsigned (count) + */ + std::map>> + m_pending_ldgsts; // modifiers virtual void issue(register_set &inst); bool is_issue_partitioned() { return false; } @@ -2069,6 +2114,9 @@ class shader_core_ctx : public core_t { // modifiers virtual void warp_exit(unsigned warp_id); + // Ni: Unset ldgdepbar + void unset_depbar(const warp_inst_t &inst); + // accessors virtual bool warp_waiting_at_barrier(unsigned warp_id) const; void get_pdom_stack_top_info(unsigned tid, unsigned *pc, unsigned *rpc) const; From d09254ec92ba12ae8af2b8a0a1704985fd2383eb Mon Sep 17 00:00:00 2001 From: Fangjia Shen <50934207+FJShen@users.noreply.github.com> Date: Mon, 18 Dec 2023 15:50:35 -0500 Subject: [PATCH 132/154] Update gpgpusim.config Fixed an outdated comment line --- configs/tested-cfgs/SM7_GV100/gpgpusim.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/tested-cfgs/SM7_GV100/gpgpusim.config b/configs/tested-cfgs/SM7_GV100/gpgpusim.config index 1595c6901..26ce0eb58 100644 --- a/configs/tested-cfgs/SM7_GV100/gpgpusim.config +++ b/configs/tested-cfgs/SM7_GV100/gpgpusim.config @@ -130,7 +130,7 @@ # Volta has four 
schedulers per core -gpgpu_num_sched_per_core 4 -# Greedy then oldest scheduler +# Loose round robin scheduler -gpgpu_scheduler lrr ## In Volta, a warp scheduler can issue 1 inst per cycle -gpgpu_max_insn_issue_per_warp 1 @@ -234,4 +234,4 @@ # tracing functionality #-trace_enabled 1 #-trace_components WARP_SCHEDULER,SCOREBOARD -#-trace_sampling_core 0 \ No newline at end of file +#-trace_sampling_core 0 From 3c95cd1d2fb19e75fad6cda1ab2220ce1850196b Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Mon, 22 Jan 2024 13:13:12 -0500 Subject: [PATCH 133/154] Adding Github Actino CI --- .github/workflows/main.yml | 63 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 .github/workflows/main.yml diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 000000000..115b11b23 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,63 @@ +# This is a basic workflow to help you get started with Actions + +name: CI + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the mydev branch + push: + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + build-GTX480: + runs-on: ubuntu-latest + container: + image: tgrogers/gpgpu-sim_regress:volta_update + env: + CONFIG: GTX480 + CUDA_INSTALL_PATH: /usr/local/cuda-4.2/ + PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-4.2/ + GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/travis.sh + build-TITANV: + runs-on: ubuntu-latest + container: + image: 
tgrogers/gpgpu-sim_regress:volta_update + env: + CONFIG: TITANV + CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ + PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ + GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/travis.sh + build-TITANV-LOCALXBAR: + runs-on: ubuntu-latest + container: + image: tgrogers/gpgpu-sim_regress:volta_update + env: + CONFIG: TITANV-LOCALXBAR + CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ + PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ + GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v2 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/travis.sh From b2f0ebee9a964b93882188d84e6ccd0f61996087 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Wed, 24 Jan 2024 16:16:57 -0500 Subject: [PATCH 134/154] update CI scripts --- .github/workflows/main.yml | 36 ++++++++++++++++++------------------ travis.sh => short-tests.sh | 2 -- 2 files changed, 18 insertions(+), 20 deletions(-) rename travis.sh => short-tests.sh (94%) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 115b11b23..a5a736adb 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -16,48 +16,48 @@ jobs: build-GTX480: runs-on: ubuntu-latest container: - image: tgrogers/gpgpu-sim_regress:volta_update + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 env: CONFIG: GTX480 - CUDA_INSTALL_PATH: /usr/local/cuda-4.2/ - PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-4.2/ - GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks + # 
CUDA_INSTALL_PATH: /usr/local/cuda-4.2/ + # PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-4.2/ + # GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v2 + - uses: actions/checkout - name: Run Simulation - run: /bin/bash $GITHUB_WORKSPACE/travis.sh + run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh build-TITANV: runs-on: ubuntu-latest container: - image: tgrogers/gpgpu-sim_regress:volta_update + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 env: CONFIG: TITANV - CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ - PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ - GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks + # CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ + # PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ + # GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout@v2 + - uses: actions/checkout - name: Run Simulation - run: /bin/bash $GITHUB_WORKSPACE/travis.sh + run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh build-TITANV-LOCALXBAR: runs-on: ubuntu-latest container: - image: tgrogers/gpgpu-sim_regress:volta_update + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 env: CONFIG: TITANV-LOCALXBAR - CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ - PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ - GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks + # CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ + # PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ + # GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - 
uses: actions/checkout@v2 + - uses: actions/checkout - name: Run Simulation - run: /bin/bash $GITHUB_WORKSPACE/travis.sh + run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh diff --git a/travis.sh b/short-tests.sh similarity index 94% rename from travis.sh rename to short-tests.sh index bbdd19acf..bb1c6695a 100755 --- a/travis.sh +++ b/short-tests.sh @@ -17,8 +17,6 @@ export PATH=$CUDA_INSTALL_PATH/bin:$PATH source ./setup_environment make -j -pip install psutil -rm -rf accel-sim-framework git clone https://github.com/accel-sim/accel-sim-framework.git ./accel-sim-framework/util/job_launching/run_simulations.py -C $CONFIG -B rodinia_2.0-ft -N regress -l local ./accel-sim-framework/util/job_launching/monitor_func_test.py -v -N regress -j procman From 2bbfb8b3e2d6a5db1079c8a1077a08ad208ad38d Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Wed, 24 Jan 2024 16:18:01 -0500 Subject: [PATCH 135/154] uses actions/checkout@v4 --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a5a736adb..382095e0c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -26,7 +26,7 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout + - uses: actions/checkout@v4 - name: Run Simulation run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh build-TITANV: @@ -42,7 +42,7 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout + - uses: actions/checkout@v4 - name: Run Simulation run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh build-TITANV-LOCALXBAR: @@ -58,6 +58,6 @@ jobs: # Steps represent a sequence of tasks that will be executed as part of the job 
steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - - uses: actions/checkout + - uses: actions/checkout@v4 - name: Run Simulation run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh From 77aefacafa0af7d45c407d772bd493397c6e1ae5 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:08:26 -0500 Subject: [PATCH 136/154] fix dubious ownership --- short-tests.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/short-tests.sh b/short-tests.sh index bb1c6695a..44f265a96 100755 --- a/short-tests.sh +++ b/short-tests.sh @@ -13,6 +13,8 @@ if [ ! -n "$GPUAPPS_ROOT" ]; then exit; fi +git config --system --add safe.directory '*' + export PATH=$CUDA_INSTALL_PATH/bin:$PATH source ./setup_environment make -j From 1bdb39acb89ce1203d4fc96a00ce3c3f51fe72b8 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:49:30 -0500 Subject: [PATCH 137/154] remove fermi and add newer gen cards --- .github/workflows/main.yml | 51 +++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 382095e0c..742a90613 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,15 +13,12 @@ on: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: - build-GTX480: + build-TITANV: runs-on: ubuntu-latest container: image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 env: - CONFIG: GTX480 - # CUDA_INSTALL_PATH: /usr/local/cuda-4.2/ - # PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-4.2/ - # GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks + CONFIG: TITANV # Steps represent a sequence of tasks that will be executed as part of the job steps: @@ -29,15 +26,13 @@ jobs: - uses: actions/checkout@v4 - name: Run Simulation run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh - build-TITANV: + + 
build-TITANV-LOCALXBAR: runs-on: ubuntu-latest container: image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 env: - CONFIG: TITANV - # CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ - # PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ - # GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks + CONFIG: TITANV-LOCALXBAR # Steps represent a sequence of tasks that will be executed as part of the job steps: @@ -45,15 +40,41 @@ jobs: - uses: actions/checkout@v4 - name: Run Simulation run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh - build-TITANV-LOCALXBAR: + + build-QV100: runs-on: ubuntu-latest container: image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 env: - CONFIG: TITANV-LOCALXBAR - # CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ - # PTXAS_CUDA_INSTALL_PATH: /usr/local/cuda-9.1/ - # GPUAPPS_ROOT: /home/runner/gpgpu-sim_simulations/benchmarks + CONFIG: QV100 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh + + build-2060: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: RTX2060 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh + + build-3070: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: RTX3070 # Steps represent a sequence of tasks that will be executed as part of the job steps: From d935bd167dd5806cc9518eae69176c868bffc0d9 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Mon, 5 Feb 2024 16:12:47 -0500 Subject: [PATCH 138/154] rename ci tests --- 
.github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 742a90613..c639ff3fb 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,6 +1,6 @@ # This is a basic workflow to help you get started with Actions -name: CI +name: Short-Tests # Controls when the workflow will run on: From bc8061fd1e3c26be37e2cbb83ff9ca26e6f4dead Mon Sep 17 00:00:00 2001 From: WilliamMTK Date: Thu, 4 Apr 2024 18:13:54 -0400 Subject: [PATCH 139/154] Migrate gpgpu-sim build system to cmake (#66) * migrate_cmake: add package dependency checking * migrate_cmake: port setup_environment to CMake * migrate_cmake: break dependency checking and env export gen to different .cmake files * migrate_cmake: use CUDAToolkit_FOUND to test for CUDA compiler * migrate_cmake: use CUDAToolkit_FOUND to test for CUDA compiler * migrate_cmake: use CUDAToolkit_FOUND to test for CUDA compiler * migrate_cmake: properly parse for cuda version number * migrate_cmake: set highest CUDA supported to be 11.10.x * migrate_cmake: specify top level CMake file * migrate_cmake: add libcuda cmake file * migrate_cmake: use global compiler options and definitions * migrate_cmake: add cmake file to src * migrate_cmake: add cmake files for cuda-sim folder * migrate_cmake: add cmake files to gpgpu-sim folder * migrate_cmake: add cmake files for intersim * migrate_cmake: add short test using cmake * migrate_cmake: bump CXX standard requirement to 17 * Add cmake files for accelwattch * migrate_cmake: remove use of GLOB to grab source files * migrate_cmake: comment out the write protection on generated instructions.h * migrate_cmake: create sym folder and add newline to generated setup file * migrate_cmake: fix some path issues * migrate_cmake: let cmake thinks flex and bison generate CXX files * migrate_cmake: fix not linking pthread properly * migrate_cmake: remove debug message * migrate_cmake: add empty 
libopencl cmake file * migrate_cmake: install phase and runtime version detect * Added install phase to install the shared object and add symlinks * Changes with CUDA toolkit will be detected and triggered a rebuild * GPGPU-Sim detailed version string will be updated on each build * Typo fix and fix correct bin dir * Replace gcc -> g++ in intersim * ignore setup * check CMAKE_BUILD_TYPE * set DCMAKE_BUILD_TYPE --------- Co-authored-by: JRPAN <25518778+JRPan@users.noreply.github.com> --- .github/workflows/cmake.yml | 83 +++++++++++ .gitignore | 1 + CMakeLists.txt | 167 +++++++++++++++++++++++ gpgpusim_check.cmake | 136 ++++++++++++++++++ gpgpusim_gen_build_string.cmake | 27 ++++ gpgpusim_gen_setup_environment.cmake | 31 +++++ gpgpusim_install.cmake | 2 + gpgpusim_unset_cuda.cmake | 60 ++++++++ libcuda/CMakeLists.txt | 26 ++++ libopencl/CMakeLists.txt | 0 short-tests-cmake.sh | 27 ++++ src/CMakeLists.txt | 19 +++ src/accelwattch/CMakeLists.txt | 46 +++++++ src/cuda-sim/CMakeLists.txt | 78 +++++++++++ src/gpgpu-sim/CMakeLists.txt | 36 +++++ src/intersim2/CMakeLists.txt | 106 ++++++++++++++ src/intersim2/Makefile | 2 +- src/intersim2/config_utils.cpp | 10 +- src/intersim2/config_utils.hpp | 2 +- src/intersim2/interconnect_interface.cpp | 2 +- version.in | 1 + 21 files changed, 854 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/cmake.yml create mode 100644 CMakeLists.txt create mode 100644 gpgpusim_check.cmake create mode 100644 gpgpusim_gen_build_string.cmake create mode 100644 gpgpusim_gen_setup_environment.cmake create mode 100644 gpgpusim_install.cmake create mode 100644 gpgpusim_unset_cuda.cmake create mode 100644 libcuda/CMakeLists.txt create mode 100644 libopencl/CMakeLists.txt create mode 100755 short-tests-cmake.sh create mode 100644 src/CMakeLists.txt create mode 100644 src/accelwattch/CMakeLists.txt create mode 100644 src/cuda-sim/CMakeLists.txt create mode 100644 src/gpgpu-sim/CMakeLists.txt create mode 100644 src/intersim2/CMakeLists.txt 
create mode 100644 version.in diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml new file mode 100644 index 000000000..ab9bfd019 --- /dev/null +++ b/.github/workflows/cmake.yml @@ -0,0 +1,83 @@ +# Workflow with cmake build system +name: Short-Tests-CMake + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the mydev branch + push: + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + build-TITANV: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: TITANV + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-cmake.sh + + build-TITANV-LOCALXBAR: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: TITANV-LOCALXBAR + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-cmake.sh + + build-QV100: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: QV100 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-cmake.sh + + build-2060: + runs-on: ubuntu-latest + container: + image: 
tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: RTX2060 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-cmake.sh + + build-3070: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + env: + CONFIG: RTX3070 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-cmake.sh diff --git a/.gitignore b/.gitignore index 4b343c557..340277af8 100644 --- a/.gitignore +++ b/.gitignore @@ -61,3 +61,4 @@ debug_tools/WatchYourStep/ptxjitplus/*.ptx accel-sim-framework/ gpu-app-collection/ +setup diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..95ca8e085 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,167 @@ +cmake_minimum_required(VERSION 3.17) + +# Project name and version +project(GPGPU-Sim + VERSION 4.2.0 + DESCRIPTION "cycle-level simulator modeling contemporary graphics processing units (GPUs)" + HOMEPAGE_URL https://github.com/accel-sim/gpgpu-sim_distribution + LANGUAGES CXX) + +# Specify the C++ standard +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) + +# GPGPU-Sim build option +option(GPGPUSIM_ENABLE_TRACE "Whether to enable GPGPU-Sim debug tracing" ON) + +# GPGPU-Sim conditional build variable +set(GPGPUSIM_USE_POWER_MODEL OFF) +set(GPGPUSIM_USE_OPENCL OFF) + +# Check for dependencies +include(gpgpusim_check.cmake) + +# Create version file +add_custom_target(gen_build_string ALL + COMMAND ${CMAKE_COMMAND} -D INPUT_DIR=${CMAKE_CURRENT_SOURCE_DIR} -D OUTPUT_DIR=${CMAKE_BINARY_DIR} -P 
${CMAKE_CURRENT_SOURCE_DIR}/gpgpusim_gen_build_string.cmake + COMMENT "Generating build string file to ${CMAKE_CURRENT_BINARY_DIR}") + +# CMake target +# GPGPU-Sim CUDA Runtime lib +# Use the entrypoint object files sources else CMake will complain +add_library(cudart SHARED $) +add_library(entrypoint STATIC $) + +# Add global C/CXX compilation flags and definitions +# TODO Specify more build modes like gem5 with fast opt? +if(${CMAKE_BUILD_TYPE} STREQUAL "Debug") + add_compile_definitions(DEBUG=1) + add_compile_options("$<$:-Wall;-Wno-unused-function;-Wno-sign-compare;-g;-fPIC>") + add_compile_options("$<$:-Wall;-Wno-unused-function;-Wno-sign-compare;-ggdb;-fPIC>") +else() + add_compile_definitions(DEBUG=0) + add_compile_options("$<$:-O3;-g;-Wall;-Wno-unused-function;-Wno-sign-compare;-fPIC>") + add_compile_options("$<$:-Wall;-Wno-unused-function;-Wno-sign-compare;-fPIC>") +endif() + +# Add CUDA version +add_compile_definitions(CUDART_VERSION=${CUDA_VERSION_NUMBER}) + +# OpenCL support +if(GPGPUSIM_USE_OPENCL) + add_compile_definitions(OPENGL_SUPPORT) +endif() + +# Tracing support +if(GPGPUSIM_ENABLE_TRACE) + add_compile_definitions(TRACING_ON=1) +endif() + +# Add subdirectory +add_subdirectory(src) +add_subdirectory(libcuda) +add_subdirectory(libopencl) + +# Set linker option for libcudart.so +if(APPLE) + target_link_options(cudart PUBLIC "-Wl,-headerpad_max_install_names,-undefined,dynamic_lookup,-compatibility_version,1.1,-current_version,1.1;-lm;-lz;-pthread") +else() + target_link_options(cudart PUBLIC + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/linux-so-version.txt;-lm;-lz;-lGL;-pthread") + target_link_options(entrypoint PUBLIC + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/linux-so-version.txt;-lm;-lz;-lGL;-pthread") +endif() +# cuda: CUDA API lib +# ptxsim: cuda-sim, functional simulator +# gpgpusim: gpu simulator (gpgpu-sim) +# intersim: interconnect simulator +# accelwattch: power simulator +# Rest of source files in src/ will be created 
with gpgpusim_entrypoint target +target_link_libraries(cudart PUBLIC cuda ptxsim gpgpusim intersim) +target_link_libraries(entrypoint PUBLIC cuda ptxsim gpgpusim intersim) +if(GPGPUSIM_USE_POWER_MODEL) +target_link_libraries(cudart PUBLIC cuda ptxsim gpgpusim intersim accelwattch) +target_link_libraries(entrypoint PUBLIC cuda ptxsim gpgpusim intersim accelwattch) +endif() + +# TODO Conditionally build for Opencl? +# if(GPGPUSIM_USE_OPENCL) +# add_library(OpenCL) +# endif() + +# Install and post-install +# Get configure +set(GPGPUSIM_CONFIG "gcc-${CMAKE_CXX_COMPILER_VERSION}/cuda-${CUDA_VERSION_NUMBER}/${GPGPUSIM_BUILD_MODE}") + +# Env var setup script +include(gpgpusim_gen_setup_environment.cmake) + +# Installation +set(GPGPUSIM_INSTALL_PATH ${PROJECT_SOURCE_DIR}/lib/${GPGPUSIM_CONFIG}) +install(TARGETS cudart DESTINATION ${GPGPUSIM_INSTALL_PATH}) + +# Installing symlinks +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.2\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.3\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.4\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.5.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.5.5\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.6.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.6.5\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E 
create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.7.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.7.5\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.8.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.9.0\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.9.1\)") + install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.9.2\)") + install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.10.0\)") + install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.10.1\)") +install(CODE "execute_process\(\ + COMMAND ${CMAKE_COMMAND} -E create_symlink \ + ${GPGPUSIM_INSTALL_PATH}/$ \ + ${GPGPUSIM_INSTALL_PATH}/$.11.0\)") \ No newline at end of file diff --git a/gpgpusim_check.cmake b/gpgpusim_check.cmake new file mode 100644 index 000000000..5da46c979 --- /dev/null +++ b/gpgpusim_check.cmake @@ -0,0 +1,136 @@ +# Dependency checking +# Unset FindCUDA variables so that it +# gets reconfigured +include(gpgpusim_unset_cuda.cmake) + +find_package(Git REQUIRED) +find_package(BISON REQUIRED) +find_package(FLEX REQUIRED) +find_package(ZLIB REQUIRED) +find_package(CUDAToolkit REQUIRED) +find_package(Doxygen) +find_package(Python3) + +# GPGPU-Sim additional checking and info +message(CHECK_START "Additional settings for ${CMAKE_PROJECT_NAME}") +list(APPEND CMAKE_MESSAGE_INDENT " ") + +# Check for OS +message(CHECK_START "Checking for 
OS") +if((NOT APPLE) AND (NOT UNIX) AND (NOT LINUX)) + message(FATAL_ERROR "${CMAKE_SYSTEM_NAME} not supported") +else() + message(CHECK_PASS ${CMAKE_SYSTEM_NAME}) +endif() + +# Check for version +message(CHECK_START "Checking GPGPU-Sim version") +message(CHECK_PASS "${CMAKE_PROJECT_VERSION}") + +# Check for git commit hash +message(CHECK_START "Checking git commit hash") +# Get the latest abbreviated commit hash of the working branch +execute_process( + COMMAND git log -1 --format=%H + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + OUTPUT_VARIABLE GPGPUSIM_CONFIG_GIT_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GPGPUSIM_CHECK_GIT_HASH +) +if(${GPGPUSIM_CHECK_GIT_HASH}) + message(CHECK_FAIL "not a git repo") +else() + message(CHECK_PASS "${GPGPUSIM_CONFIG_GIT_HASH}") +endif() + +# Check for compiler and version +message(CHECK_START "Checking CXX compiler") +if(NOT (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU)) + message(CHECK_FAIL "GPGPU-Sim only tested with GCC: ${CMAKE_CXX_COMPILER_ID}") +else() + message(CHECK_PASS "${CMAKE_CXX_COMPILER}") +endif() +message(CHECK_START "Checking CXX compiler version") +message(CHECK_PASS "${CMAKE_CXX_COMPILER_VERSION}") +set(GPGPSIM_CC_VERSION ) + +# Check for CUDA nvcc and version +# Check already done with find_package, here just to display the path and version +message(CHECK_START "Checking CUDA compiler") +if(NOT CUDAToolkit_FOUND) + message(CHECK_FAIL "not found") +else() + message(CHECK_PASS "${CUDAToolkit_NVCC_EXECUTABLE}") + message(CHECK_START "Checking CUDA compiler version") + message(CHECK_PASS "${CUDAToolkit_VERSION}") + if((CUDAToolkit_VERSION VERSION_LESS 2.0.3) OR (CUDAToolkit_VERSION VERSION_GREATER 11.10.0)) + message(FATAL_ERROR "GPGPU-Sim ${CMAKE_PROJECT_VERSION} not tested with CUDA version ${CUDAToolkit_VERSION} (please see README)") + endif() +endif() + +# Check for Power model +# TODO How to configure the project to look for it? 
+message(CHECK_START "Checking for GPGPU-Sim power model") +if(IS_DIRECTORY ${PROJECT_SOURCE_DIR}/src/accelwattch) + if(NOT EXISTS ${PROJECT_SOURCE_DIR}/src/accelwattch/gpgpu_sim.verify) + message(FATAL_ERROR "gpgpu_sim.verify not found in ${PROJECT_SOURCE_DIR}/src/accelwattch/") + endif() + message(CHECK_PASS "${PROJECT_SOURCE_DIR}/src/accelwattch/") + set(GPGPUSIM_USE_POWER_MODEL True) + set(GPGPUSIM_POWER_MODEL ${PROJECT_SOURCE_DIR}/src/accelwattch) +elseif(DEFINED ${GPGPUSIM_POWER_MODEL}) + if(NOT EXISTS ${GPGPUSIM_POWER_MODEL}/gpgpu_sim.verify) + message(FATAL_ERROR "gpgpu_sim.verify not found in ${GPGPUSIM_POWER_MODEL} - Either incorrect directory or incorrect McPAT version") + endif() + message(CHECK_PASS "${GPGPUSIM_POWER_MODEL}") + set(GPGPUSIM_USE_POWER_MODEL True) +else() + message(CHECK_PASS "configured without a power model") +endif() + +# Set Build path +# Get CUDA version +set(CUDA_VERSION_STRING "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}") +# execute_process( +# COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} --version +# COMMAND awk "/release/ {print $5;}" +# COMMAND sed "s/,//" +# WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} +# OUTPUT_VARIABLE CUDA_VERSION_STRING +# OUTPUT_STRIP_TRAILING_WHITESPACE +# ) + +# CMake cannot do formatted string output, so we just use the good old `awk` +# math(EXPR CUDA_VERSION_NUMBER_MAJOR "${CUDAToolkit_VERSION_MAJOR} * 10") +# math(EXPR CUDA_VERSION_NUMBER_MINOR "${CUDAToolkit_VERSION_MINOR} * 10") +# set(CUDA_VERSION_NUMBER "${CUDA_VERSION_NUMBER_MAJOR}${CUDA_VERSION_NUMBER_MINOR}") +execute_process( + COMMAND echo ${CUDA_VERSION_STRING} + COMMAND sed "s/\\./ /" + COMMAND awk "{printf(\"%02u%02u\", 10*int($1), 10*$2);}" + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + OUTPUT_VARIABLE CUDA_VERSION_NUMBER + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Get debug or release +# Set with -DCMAKE_BUILD_TYPE=Debug|Release to change build type +message(CHECK_START "Checking for CMAKE_BUILD_TYPE") +if(NOT CMAKE_BUILD_TYPE) 
+ set(CMAKE_BUILD_TYPE Release) + set(GPGPUSIM_BUILD_MODE "release" CACHE STRING "" FORCE) +else() + string(TOLOWER "${CMAKE_BUILD_TYPE}" GPGPUSIM_BUILD_MODE) + set(CMAKE_BUILD_TYPE Debug) +endif() +message(CHECK_PASS "${CMAKE_BUILD_TYPE}") +# TODO: Make this step an installation phase that handle copying so and creating symlinks +message(STATUS "Setting binary directory to ${CMAKE_BINARY_DIR}") + +# TODO OpenCL check/support? + +list(POP_BACK CMAKE_MESSAGE_INDENT) +message(CHECK_PASS "done") +message(STATUS "Be sure to run 'source setup' " + "before you run CUDA program with GPGPU-Sim or building with external " + "simulator like SST") \ No newline at end of file diff --git a/gpgpusim_gen_build_string.cmake b/gpgpusim_gen_build_string.cmake new file mode 100644 index 000000000..4559570c4 --- /dev/null +++ b/gpgpusim_gen_build_string.cmake @@ -0,0 +1,27 @@ +# Get hash +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${INPUT_DIR} + OUTPUT_VARIABLE GPGPUSIM_GIT_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +# Get diff +execute_process( + COMMAND git diff --numstat + COMMAND wc + COMMAND sed -re "s/^\\s+([0-9]+).*/\\1./" + WORKING_DIRECTORY ${INPUT_DIR} + OUTPUT_VARIABLE GPGPUSIM_GIT_DIFF + OUTPUT_STRIP_TRAILING_WHITESPACE +) +execute_process( + COMMAND git diff --numstat --staged + COMMAND wc + COMMAND sed -re "s/^\\s+([0-9]+).*/\\1./" + WORKING_DIRECTORY ${INPUT_DIR} + OUTPUT_VARIABLE GPGPUSIM_GIT_DIFF_STAGED + OUTPUT_STRIP_TRAILING_WHITESPACE +) +set(GPGPUSIM_BUILD_STRING "gpgpu-sim_git-commit-${GPGPUSIM_GIT_HASH}_modified_${GPGPUSIM_GIT_DIFF}${GPGPUSIM_GIT_DIFF_STAGED}") +configure_file(${INPUT_DIR}/version.in ${OUTPUT_DIR}/detailed_version) diff --git a/gpgpusim_gen_setup_environment.cmake b/gpgpusim_gen_setup_environment.cmake new file mode 100644 index 000000000..e74a7f5c4 --- /dev/null +++ b/gpgpusim_gen_setup_environment.cmake @@ -0,0 +1,31 @@ +# Need to create a setup script to set some variables for others to interact with 
+set(SETUP_SCRIPT_FILENAME "setup") +message(STATUS "Writing setup commands to '${SETUP_SCRIPT_FILENAME}'") +file(WRITE ${SETUP_SCRIPT_FILENAME} "export GPGPUSIM_SETUP_ENVIRONMENT_WAS_RUN=1\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export GPGPUSIM_ROOT=${PROJECT_SOURCE_DIR}\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export GPGPUSIM_CONFIG=${GPGPUSIM_CONFIG}\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export CUDA_INSTALL_PATH=${CUDAToolkit_TARGET_DIR}\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export PATH=`echo $PATH | sed 's#$GPGPUSIM_ROOT/bin:$CUDA_INSTALL_PATH/bin:##'`\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export PATH=$GPGPUSIM_ROOT/bin:$CUDA_INSTALL_PATH/bin:$PATH\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export CUDA_VERSION_NUMBER=${CUDA_VERSION_NUMBER}\n") +if(CUDA_VERSION_NUMBER GREATER_EQUAL 6000) + file(APPEND ${SETUP_SCRIPT_FILENAME} "export PTX_SIM_USE_PTX_FILE=1.ptx\n") + file(APPEND ${SETUP_SCRIPT_FILENAME} "export PTX_SIM_KERNELFILE=_1.ptx\n") + file(APPEND ${SETUP_SCRIPT_FILENAME} "export CUOBJDUMP_SIM_FILE=jj\n") +endif() +# TODO What about OpenCL support? + +# setting LD_LIBRARY_PATH as follows enables GPGPU-Sim to be invoked by +# native CUDA and OpenCL applications. GPGPU-Sim is dynamically linked +# against instead of the CUDA toolkit. This replaces this cumbersome +# static link setup in prior GPGPU-Sim releases. 
+# Create a softlink for backward support +if(APPLE) +file(APPEND ${SETUP_SCRIPT_FILENAME} "export DYLD_LIBRARY_PATH=`echo $DYLD_LIBRARY_PATH | sed -Ee 's#'$GPGPUSIM_ROOT'\/lib\/[0-9]+\/(debug|release):##'`\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export DYLD_LIBRARY_PATH=$GPGPUSIM_ROOT/lib/$GPGPUSIM_CONFIG:$DYLD_LIBRARY_PATH\n") +else() +file(APPEND ${SETUP_SCRIPT_FILENAME} "export LD_LIBRARY_PATH=`echo $LD_LIBRARY_PATH | sed -re 's#'$GPGPUSIM_ROOT'\/lib\/[0-9]+\/(debug|release):##'`\n") +file(APPEND ${SETUP_SCRIPT_FILENAME} "export LD_LIBRARY_PATH=$GPGPUSIM_ROOT/lib/$GPGPUSIM_CONFIG:$LD_LIBRARY_PATH\n") +endif() + +# TODO ignore the OPENCL_REMOTE_GPU_HOST part? \ No newline at end of file diff --git a/gpgpusim_install.cmake b/gpgpusim_install.cmake new file mode 100644 index 000000000..1590bf369 --- /dev/null +++ b/gpgpusim_install.cmake @@ -0,0 +1,2 @@ +# TODO Create the build/gcc-X.X/cuda-XXXX/release folder and put so to it +# TODO Also create symlinks to the libcudart.so \ No newline at end of file diff --git a/gpgpusim_unset_cuda.cmake b/gpgpusim_unset_cuda.cmake new file mode 100644 index 000000000..4eaef9107 --- /dev/null +++ b/gpgpusim_unset_cuda.cmake @@ -0,0 +1,60 @@ +# Unset these variable to force a re-search on possible CUDA version changes +unset(CUDAToolkit_BIN_DIR CACHE) +unset(CUDAToolkit_CUPTI_INCLUDE_DIR CACHE) +unset(CUDAToolkit_NVCC_EXECUTABLE CACHE) +unset(CUDAToolkit_rt_LIBRARY CACHE) +unset(CUDA_CUDART CACHE) +unset(CUDA_OpenCL_LIBRARY CACHE) +unset(CUDA_cublasLt_LIBRARY CACHE) +unset(CUDA_cublasLt_static_LIBRARY CACHE) +unset(CUDA_cublas_LIBRARY CACHE) +unset(CUDA_cublas_static_LIBRARY CACHE) +unset(CUDA_cuda_driver_LIBRARY CACHE) +unset(CUDA_cudart_LIBRARY CACHE) +unset(CUDA_cudart_static_LIBRARY CACHE) +unset(CUDA_cufft_LIBRARY CACHE) +unset(CUDA_cufft_static_LIBRARY CACHE) +unset(CUDA_cufft_static_nocallback_LIBRARY CACHE) +unset(CUDA_cufftw_LIBRARY CACHE) +unset(CUDA_cufftw_static_LIBRARY CACHE) +unset(CUDA_culibos_LIBRARY CACHE) 
+unset(CUDA_cupti_LIBRARY CACHE) +unset(CUDA_cupti_static_LIBRARY CACHE) +unset(CUDA_curand_LIBRARY CACHE) +unset(CUDA_curand_static_LIBRARY CACHE) +unset(CUDA_cusolver_LIBRARY CACHE) +unset(CUDA_cusolver_lapack_static_LIBRARY CACHE) +unset(CUDA_cusolver_static_LIBRARY CACHE) +unset(CUDA_cusparse_LIBRARY CACHE) +unset(CUDA_cusparse_static_LIBRARY CACHE) +unset(CUDA_nppc_LIBRARY CACHE) +unset(CUDA_nppc_static_LIBRARY CACHE) +unset(CUDA_nppial_LIBRARY CACHE) +unset(CUDA_nppial_static_LIBRARY CACHE) +unset(CUDA_nppicc_LIBRARY CACHE) +unset(CUDA_nppicc_static_LIBRARY CACHE) +unset(CUDA_nppicom_LIBRARY CACHE) +unset(CUDA_nppicom_static_LIBRARY CACHE) +unset(CUDA_nppidei_LIBRARY CACHE) +unset(CUDA_nppidei_static_LIBRARY CACHE) +unset(CUDA_nppif_LIBRARY CACHE) +unset(CUDA_nppif_static_LIBRARY CACHE) +unset(CUDA_nppig_LIBRARY CACHE) +unset(CUDA_nppig_static_LIBRARY CACHE) +unset(CUDA_nppim_LIBRARY CACHE) +unset(CUDA_nppim_static_LIBRARY CACHE) +unset(CUDA_nppist_LIBRARY CACHE) +unset(CUDA_nppist_static_LIBRARY CACHE) +unset(CUDA_nppisu_LIBRARY CACHE) +unset(CUDA_nppisu_static_LIBRARY CACHE) +unset(CUDA_nppitc_LIBRARY CACHE) +unset(CUDA_nppitc_static_LIBRARY CACHE) +unset(CUDA_npps_LIBRARY CACHE) +unset(CUDA_npps_static_LIBRARY CACHE) +unset(CUDA_nvToolsExt_LIBRARY CACHE) +unset(CUDA_nvgraph_LIBRARY CACHE) +unset(CUDA_nvgraph_static_LIBRARY CACHE) +unset(CUDA_nvjpeg_LIBRARY CACHE) +unset(CUDA_nvjpeg_static_LIBRARY CACHE) +unset(CUDA_nvml_LIBRARY CACHE) +unset(CUDA_nvrtc_LIBRARY CACHE) \ No newline at end of file diff --git a/libcuda/CMakeLists.txt b/libcuda/CMakeLists.txt new file mode 100644 index 000000000..c4ba4d181 --- /dev/null +++ b/libcuda/CMakeLists.txt @@ -0,0 +1,26 @@ +# Specify Flex and Bison target +BISON_TARGET(cuobjdump_parser cuobjdump.y ${CMAKE_CURRENT_BINARY_DIR}/cuobjdump_parser.c + COMPILE_FLAGS "-t -d -v --report=all -p cuobjdump_ --file-prefix=${CMAKE_CURRENT_BINARY_DIR}/cuobjdump") +FLEX_TARGET(cuobjdump_lexer cuobjdump.l 
${CMAKE_CURRENT_BINARY_DIR}/cuobjdump_lexer.c + COMPILE_FLAGS "-B -P cuobjdump_") +ADD_FLEX_BISON_DEPENDENCY(cuobjdump_lexer cuobjdump_parser) + +# Set generated source files to CXX +set_source_files_properties(${BISON_cuobjdump_parser_OUTPUT_SOURCE} + ${FLEX_cuobjdump_lexer_OUTPUTS} + PROPERTIES LANGUAGE CXX) + +# Create libcuda.a with all source files +add_library(cuda STATIC + cuda_runtime_api.cc + ${BISON_cuobjdump_parser_OUTPUT_SOURCE} ${FLEX_cuobjdump_lexer_OUTPUTS}) + + # Add current dir to include path +# Also add flex/bison generated header files +target_include_directories(cuda PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + +# Add cuda include path for own reference +target_include_directories(cuda PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) + +# Add project build dir to include path +target_include_directories(cuda PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/libopencl/CMakeLists.txt b/libopencl/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/short-tests-cmake.sh b/short-tests-cmake.sh new file mode 100755 index 000000000..e41444156 --- /dev/null +++ b/short-tests-cmake.sh @@ -0,0 +1,27 @@ +if [ ! -n "$CUDA_INSTALL_PATH" ]; then + echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH."; + exit; +fi + +if [ ! -n "$CONFIG" ]; then + echo "ERROR ** set the CONFIG env variable to one of those found in ./accel-sim-framework/util/job_launching/configs/define-standard-cfgs.yml"; + exit; +fi + +if [ ! 
-n "$GPUAPPS_ROOT" ]; then + echo "ERROR ** GPUAPPS_ROOT to a location where the apps have been compiled"; + exit; +fi + +git config --system --add safe.directory '*' + +export PATH=$CUDA_INSTALL_PATH/bin:$PATH + +cmake -B build +cmake --build build -j +cmake --install build +source setup + +git clone https://github.com/accel-sim/accel-sim-framework.git +./accel-sim-framework/util/job_launching/run_simulations.py -C $CONFIG -B rodinia_2.0-ft -N regress -l local +./accel-sim-framework/util/job_launching/monitor_func_test.py -v -N regress -j procman diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 000000000..5849629e8 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,19 @@ +# gpgpusim_entrypoint objects +add_library(gpgpusim_entrypoint OBJECT + abstract_hardware_model.cc + debug.cc + gpgpusim_entrypoint.cc + option_parser.cc + statwrapper.cc + stream_manager.cc + trace.cc) + +# Add current folder and CUDA include to include path +target_include_directories(gpgpusim_entrypoint PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(gpgpusim_entrypoint PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + +# Add subdir +add_subdirectory(accelwattch) +add_subdirectory(cuda-sim) +add_subdirectory(gpgpu-sim) +add_subdirectory(intersim2) \ No newline at end of file diff --git a/src/accelwattch/CMakeLists.txt b/src/accelwattch/CMakeLists.txt new file mode 100644 index 000000000..cb7dd7178 --- /dev/null +++ b/src/accelwattch/CMakeLists.txt @@ -0,0 +1,46 @@ +set(GPGPUSIM_ACCELWATTCH_NTHREADS "4" CACHE STRING "Accelwattch MCPAT thread count") +add_library(accelwattch STATIC + cacti/Ucache.cc + XML_Parse.cc + cacti/arbiter.cc + cacti/area.cc + array.cc + cacti/bank.cc + cacti/basic_circuit.cc + basic_components.cc + cacti/cacti_interface.cc + cacti/component.cc + core.cc + cacti/crossbar.cc + cacti/decoder.cc + cacti/htree2.cc + interconnect.cc + cacti/io.cc + iocontrollers.cc + logic.cc + main.cc + cacti/mat.cc + memoryctrl.cc + noc.cc + cacti/nuca.cc + 
cacti/parameter.cc + processor.cc + cacti/router.cc + sharedcache.cc + cacti/subarray.cc + cacti/technology.cc + cacti/uca.cc + cacti/wire.cc + xmlParser.cc + gpgpu_sim_wrapper.cc) +target_include_directories(accelwattch PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(accelwattch PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cacti) +# Compile options +target_compile_options(accelwattch PRIVATE "-Wno-unknown-pragmas") +if($) + target_compile_definitions(NTHREADS=1) +else() + target_compile_options(accelwattch PRIVATE "-msse2;-mfpmath=sse") + target_compile_definitions(accelwattch PRIVATE -DNTHREADS=${GPGPUSIM_ACCELWATTCH_NTHREADS}) +endif() +target_link_options(accelwattch PRIVATE "-lm;-lpthread;-lz") \ No newline at end of file diff --git a/src/cuda-sim/CMakeLists.txt b/src/cuda-sim/CMakeLists.txt new file mode 100644 index 000000000..3378b7743 --- /dev/null +++ b/src/cuda-sim/CMakeLists.txt @@ -0,0 +1,78 @@ +# Specify Flex and Bison target +BISON_TARGET(ptx_parser ptx.y ${CMAKE_CURRENT_BINARY_DIR}/ptx.tab.c + COMPILE_FLAGS "--name-prefix=ptx_ -v -d --file-prefix=${CMAKE_CURRENT_BINARY_DIR}/ptx") +BISON_TARGET(ptxinfo_parser ptxinfo.y ${CMAKE_CURRENT_BINARY_DIR}/ptxinfo.tab.c + COMPILE_FLAGS "--name-prefix=ptxinfo_ -v -d --file-prefix=${CMAKE_CURRENT_BINARY_DIR}/ptxinfo") +FLEX_TARGET(ptx_lexer ptx.l ${CMAKE_CURRENT_BINARY_DIR}/lex.ptx_.c) +FLEX_TARGET(ptxinfo_lexer ptxinfo.l ${CMAKE_CURRENT_BINARY_DIR}/lex.ptxinfo_.c) +ADD_FLEX_BISON_DEPENDENCY(ptx_lexer ptx_parser) +ADD_FLEX_BISON_DEPENDENCY(ptxinfo_lexer ptxinfo_parser) + +# The flex and bison are using CXX, need to set their generated files to CXX so that +# they can be compiled and linked +set_source_files_properties(${BISON_ptx_parser_OUTPUT_SOURCE} + ${FLEX_ptx_lexer_OUTPUTS} + ${BISON_ptxinfo_parser_OUTPUT_SOURCE} + ${FLEX_ptxinfo_lexer_OUTPUTS} + PROPERTIES LANGUAGE CXX) +# Create libptxsim.a +add_library(ptxsim STATIC + cuda_device_printf.cc + cuda_device_runtime.cc + cuda-sim.cc + instructions.cc 
+ memory.cc + ptx_ir.cc + ptx_loader.cc + ptx_parser.cc + ptx_sim.cc + ptx-stats.cc + decuda_pred_table/decuda_pred_table.cc + ${BISON_ptx_parser_OUTPUT_SOURCE} ${FLEX_ptx_lexer_OUTPUTS} + ${BISON_ptxinfo_parser_OUTPUT_SOURCE} ${FLEX_ptxinfo_lexer_OUTPUTS}) + +# Define this for all source files, though we just need it for parser +target_compile_definitions(ptxsim PRIVATE YYDEBUG) +target_include_directories(ptxsim PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/decuda_pred_table) +target_include_directories(ptxsim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) +target_include_directories(ptxsim PRIVATE ${CMAKE_BINARY_DIR}) + +# ptxsim need buildstring +add_dependencies(ptxsim gen_build_string) + +# Create instructions.h using custom command +add_custom_target(gen_instructions_h DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/instructions.h) +add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND chmod +w ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND echo "// DO NOT EDIT THIS FILE! 
IT IS AUTOMATICALLY GENERATED BY THE MAKEFILE (see target for instructions.h)" > ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND echo "#include \"ptx_ir.h\"" >> ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND echo "#ifndef instructions_h_included" >> ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND echo "#define instructions_h_included" >> ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND cat ${CMAKE_CURRENT_SOURCE_DIR}/instructions.cc | grep "_impl(" | sed "s/{.*//" | sed "s/$/;/" >> ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + COMMAND echo "#endif" >> ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + # COMMAND chmod -w ${CMAKE_CURRENT_BINARY_DIR}/instructions.h + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/instructions.cc + VERBATIM +) +add_dependencies(ptxsim gen_instructions_h) + +# Create ptx_parser_decode.def using custom command +add_custom_target(gen_ptx_parser_decode DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/ptx_parser_decode.def) +if(UNIX) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ptx_parser_decode.def + COMMAND cat ${CMAKE_CURRENT_BINARY_DIR}/ptx.tab.h | grep "=" | sed "s/^[ ]\\+//" | sed -E "s/\\s+\\/\\*.+\\*\\///" | sed "s/[=,]//g" | sed "s/\\([_A-Z1-9]\\+\\)[ ]\\+\\([0-9]\\+\\)/\\1 \\1/" | sed "s/^/DEF(/" | sed "s/ /,\"/" | sed "s/$/\")/" | sed "/YYerror/d;/YYEOF/d;/YYEMPTY/d;/YYUNDEF/d;" > ${CMAKE_CURRENT_BINARY_DIR}/ptx_parser_decode.def + DEPENDS ${BISON_ptx_parser_OUTPUTS} + VERBATIM + ) +else() + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ptx_parser_decode.def + COMMAND cat ${CMAKE_CURRENT_BINARY_DIR}/ptx.tab.h | grep "=" | sed -E "s/^ +//" | sed -E "s/\\s+\\/\\*.+\\*\\///" | sed "s/[=,]//g" | sed -E "s/([_A-Z1-9]+).*/\\1 \\1/" | sed "s/^/DEF(/" | sed "s/ /,\"/" | sed "s/$/\")/" | sed "/YYerror/d;/YYEOF/d;/YYEMPTY/d;/YYUNDEF/d;" > ${CMAKE_CURRENT_BINARY_DIR}/ptx_parser_decode.def + DEPENDS ${BISON_ptx_parser_OUTPUTS} + VERBATIM + ) +endif() +add_dependencies(ptxsim gen_ptx_parser_decode) diff --git 
a/src/gpgpu-sim/CMakeLists.txt b/src/gpgpu-sim/CMakeLists.txt new file mode 100644 index 000000000..04f197307 --- /dev/null +++ b/src/gpgpu-sim/CMakeLists.txt @@ -0,0 +1,36 @@ +# Exclude power_interface.cc if no power model +list(APPEND gpgpusim_SRC addrdec.cc + dram.cc + dram_sched.cc + gpu-cache.cc + gpu-misc.cc + gpu-sim.cc + hashing.cc + histogram.cc + icnt_wrapper.cc + l2cache.cc + local_interconnect.cc + mem_fetch.cc + mem_latency_stat.cc + power_interface.cc + power_stat.cc + scoreboard.cc + shader.cc + stack.cc + stat-tool.cc + traffic_breakdown.cc + visualizer.cc) +if(NOT GPGPUSIM_USE_POWER_MODEL) + list(REMOVE_ITEM ${gpgpusim_SRC} power_interface.cc) +endif() + +# Create libgpgpusim.a +add_library(gpgpusim STATIC ${gpgpusim_SRC}) +target_include_directories(gpgpusim PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(gpgpusim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + +if(GPGPUSIM_USE_POWER_MODEL) +target_compile_definitions(gpgpusim PRIVATE GPGPUSIM_POWER_MODEL) +target_include_directories(gpgpusim PRIVATE ${GPGPUSIM_POWER_MODEL}) +endif() + diff --git a/src/intersim2/CMakeLists.txt b/src/intersim2/CMakeLists.txt new file mode 100644 index 000000000..c3da1b1da --- /dev/null +++ b/src/intersim2/CMakeLists.txt @@ -0,0 +1,106 @@ +option(GPGPUSIM_INTERSIM_STANDALONE "Whether to also build intersim in standalone mode" OFF) + +# Specify Flex and Bison target +BISON_TARGET(intersim_config_parser config.y ${CMAKE_CURRENT_BINARY_DIR}/y.tab.c + COMPILE_FLAGS "-y -d --file-prefix=${CMAKE_CURRENT_BINARY_DIR}/y") +FLEX_TARGET(intersim_config_lexer config.l ${CMAKE_CURRENT_BINARY_DIR}/lex.yy.c) +ADD_FLEX_BISON_DEPENDENCY(intersim_config_lexer intersim_config_parser) + +# Set generated source files to CXX +set_source_files_properties(${BISON_intersim_config_parser_OUTPUT_SOURCE} + ${FLEX_intersim_config_lexer_OUTPUTS} + PROPERTIES LANGUAGE CXX) + +# Create booksim or libintersim.a +# Shared include path +list(APPEND intersim_INC ${CMAKE_CURRENT_SOURCE_DIR} + 
${CMAKE_CURRENT_SOURCE_DIR}/allocators + ${CMAKE_CURRENT_SOURCE_DIR}/arbiters + ${CMAKE_CURRENT_SOURCE_DIR}/networks + ${CMAKE_CURRENT_SOURCE_DIR}/power + ${CMAKE_CURRENT_SOURCE_DIR}/routers + ${PROJECT_SOURCE_DIR}/src) + +# Shared source files +list(APPEND intersim_SRC + ${BISON_intersim_config_parser_OUTPUT_SOURCE} + ${FLEX_intersim_config_lexer_OUTPUTS} + allocators/allocator.cpp + allocators/islip.cpp + allocators/loa.cpp + allocators/maxsize.cpp + allocators/pim.cpp + allocators/selalloc.cpp + allocators/separable.cpp + allocators/separable_input_first.cpp + allocators/separable_output_first.cpp + allocators/wavefront.cpp + arbiters/arbiter.cpp + arbiters/matrix_arb.cpp + arbiters/prio_arb.cpp + arbiters/roundrobin_arb.cpp + arbiters/tree_arb.cpp + batchtrafficmanager.cpp + booksim_config.cpp + buffer.cpp + buffer_state.cpp + config_utils.cpp + credit.cpp + flitchannel.cpp + flit.cpp + gputrafficmanager.cpp + injection.cpp + interconnect_interface.cpp + intersim_config.cpp + main.cpp + misc_utils.cpp + module.cpp + networks/anynet.cpp + networks/cmesh.cpp + networks/dragonfly.cpp + networks/fattree.cpp + networks/flatfly_onchip.cpp + networks/fly.cpp + networks/kncube.cpp + networks/network.cpp + networks/qtree.cpp + networks/tree4.cpp + outputset.cpp + packet_reply_info.cpp + power/buffer_monitor.cpp + power/power_module.cpp + power/switch_monitor.cpp + rng_double_wrapper.cpp + rng_wrapper.cpp + routefunc.cpp + routers/chaos_router.cpp + routers/event_router.cpp + routers/iq_router.cpp + routers/router.cpp + stats.cpp + traffic.cpp + trafficmanager.cpp + vc.cpp) + +# If standalone, also build for it +if(GPGPUSIM_INTERSIM_STANDALONE) + list(REMOVE_ITEM ${intersim_SRC} interconnect_interface.cpp) + add_executable(booksim ${intersim_SRC}) + target_include_directories(booksim PUBLIC + ${intersim_INC}) + target_include_directories(booksim PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) + target_include_directories(booksim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + # Remove 
globally set TRACING_ON flag + target_compile_options(booksim PRIVATE -UTRACING_ON) +endif() + +# Specify sources for libintersim.a +add_library(intersim STATIC ${intersim_SRC}) +target_include_directories(intersim PUBLIC + ${intersim_INC} + ${PROJECT_SOURCE_DIR}/src/gpgpu-sim) +target_include_directories(intersim PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(intersim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) +target_compile_definitions(intersim PRIVATE CREATE_LIBRARY) +# Remove globally set TRACING_ON flag +target_compile_options(intersim PRIVATE -UTRACING_ON) diff --git a/src/intersim2/Makefile b/src/intersim2/Makefile index dad436aa6..a7485e23f 100644 --- a/src/intersim2/Makefile +++ b/src/intersim2/Makefile @@ -28,7 +28,7 @@ # Makefile # CXX = g++ -CC = gcc +CC = g++ CREATE_LIBRARY ?= 0 INTERFACE = interconnect_interface.cpp DEBUG ?= 0 diff --git a/src/intersim2/config_utils.cpp b/src/intersim2/config_utils.cpp index fad5fceb1..a896a93a6 100644 --- a/src/intersim2/config_utils.cpp +++ b/src/intersim2/config_utils.cpp @@ -199,27 +199,27 @@ Configuration * Configuration::GetTheConfig() //============================================================ -extern "C" void config_error( char const * msg, int lineno ) +void config_error( char * msg, int lineno ) { Configuration::GetTheConfig( )->ParseError( msg, lineno ); } -extern "C" void config_assign_string( char const * field, char const * value ) + void config_assign_string( char const * field, char const * value ) { Configuration::GetTheConfig()->Assign(field, value); } -extern "C" void config_assign_int( char const * field, int value ) +void config_assign_int( char const * field, int value ) { Configuration::GetTheConfig()->Assign(field, value); } -extern "C" void config_assign_float( char const * field, double value ) +void config_assign_float( char const * field, double value ) { Configuration::GetTheConfig()->Assign(field, value); } -extern "C" int config_input(char * line, int max_size) +int 
config_input(char * line, int max_size) { return Configuration::GetTheConfig()->Input(line, max_size); } diff --git a/src/intersim2/config_utils.hpp b/src/intersim2/config_utils.hpp index de3343bb0..1d960b6ab 100644 --- a/src/intersim2/config_utils.hpp +++ b/src/intersim2/config_utils.hpp @@ -35,7 +35,7 @@ #include #include -extern "C" int yyparse(); +int yyparse(); class Configuration { static Configuration * theConfig; diff --git a/src/intersim2/interconnect_interface.cpp b/src/intersim2/interconnect_interface.cpp index 1e1a2d73b..438852e0a 100644 --- a/src/intersim2/interconnect_interface.cpp +++ b/src/intersim2/interconnect_interface.cpp @@ -200,7 +200,7 @@ void InterconnectInterface::Push(unsigned input_deviceID, unsigned output_device void* InterconnectInterface::Pop(unsigned deviceID) { int icntID = _node_map[deviceID]; -#if DEBUG +#if 0 cout<<"Call interconnect POP " << output< Date: Fri, 5 Apr 2024 16:48:59 -0400 Subject: [PATCH 140/154] CMAKE_BUILD_TYPE should be inside ${} --- gpgpusim_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpgpusim_check.cmake b/gpgpusim_check.cmake index 5da46c979..22a62f232 100644 --- a/gpgpusim_check.cmake +++ b/gpgpusim_check.cmake @@ -116,7 +116,7 @@ execute_process( # Get debug or release # Set with -DCMAKE_BUILD_TYPE=Debug|Release to change build type message(CHECK_START "Checking for CMAKE_BUILD_TYPE") -if(NOT CMAKE_BUILD_TYPE) +if(NOT ${CMAKE_BUILD_TYPE}) set(CMAKE_BUILD_TYPE Release) set(GPGPUSIM_BUILD_MODE "release" CACHE STRING "" FORCE) else() From 570d75cf711831fda65e3edb5efc836f9f5624a0 Mon Sep 17 00:00:00 2001 From: JRPAN <25518778+JRPan@users.noreply.github.com> Date: Mon, 8 Apr 2024 15:43:54 -0400 Subject: [PATCH 141/154] Fix Build Type --- gpgpusim_check.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gpgpusim_check.cmake b/gpgpusim_check.cmake index 22a62f232..486d66dc9 100644 --- a/gpgpusim_check.cmake +++ b/gpgpusim_check.cmake @@ -116,12 +116,11 @@ 
execute_process( # Get debug or release # Set with -DCMAKE_BUILD_TYPE=Debug|Release to change build type message(CHECK_START "Checking for CMAKE_BUILD_TYPE") -if(NOT ${CMAKE_BUILD_TYPE}) +if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release) set(GPGPUSIM_BUILD_MODE "release" CACHE STRING "" FORCE) else() string(TOLOWER "${CMAKE_BUILD_TYPE}" GPGPUSIM_BUILD_MODE) - set(CMAKE_BUILD_TYPE Debug) endif() message(CHECK_PASS "${CMAKE_BUILD_TYPE}") # TODO: Make this step an installation phase that handle copying so and creating symlinks From 036b1305fc825fe29c3174dd814c413407db8584 Mon Sep 17 00:00:00 2001 From: Wonhyuk Yang Date: Sat, 22 Jun 2024 09:50:06 +0900 Subject: [PATCH 142/154] Add missing increment part for m_pending_ldgsts with minor fix (#72) LDGSTS/LDGDEPBAR was introduced #62, but it's increment part was deleted by mistake. So add it. In some applications, ldgsts may not exist between ldgdepbar. In such cases, add exception handling logic to insert an empty vector. Reported-by: Okkyun Woo Signed-off-by: Wonhyuk Yang --- src/gpgpu-sim/shader.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 67540e083..b1609d5b8 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1073,6 +1073,12 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set, m_warp[warp_id]->set_membar(); } else if (next_inst->m_is_ldgdepbar) { // Add for LDGDEPBAR m_warp[warp_id]->m_ldgdepbar_id++; + // If there are no added LDGSTS, insert an empty vector + if (m_warp[warp_id]->m_ldgdepbar_buf.size() != ldgdepbar_id + 1) { + assert(m_warp[warp_id]->m_ldgdepbar_buf.size() < ldgdepbar_id + 1); + std::vector l; + m_warp[warp_id]->m_ldgdepbar_buf.push_back(l); + } } else if (next_inst->m_is_depbar) { // Add for DEPBAR // Set to true immediately when a DEPBAR instruction is met m_warp[warp_id]->m_waiting_ldgsts = true; @@ -2642,6 +2648,9 @@ void ldst_unit::issue(register_set ®_set) { 
m_pending_writes[warp_id][reg_id] += n_accesses; } } + if (inst->m_is_ldgsts) { + m_pending_ldgsts[warp_id][inst->pc][inst->get_addr(0)] += n_accesses; + } } inst->op_pipe = MEM__OP; From 6aa7ed16ed4c244bebaf8942f8666bbd94a2c757 Mon Sep 17 00:00:00 2001 From: Shreyas Singh Date: Sat, 22 Jun 2024 00:06:55 -0500 Subject: [PATCH 143/154] Added guard to check if L2 is writeback or not (#73) --- src/gpgpu-sim/l2cache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 44d793cbc..5b63765a6 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -559,7 +559,7 @@ void memory_sub_partition::cache_cycle(unsigned cycle) { if (mf->get_access_type() == L1_WRBK_ACC) { m_request_tracker.erase(mf); delete mf; - } else { + } else if (m_config->m_L2_config.get_write_policy() == WRITE_BACK) { mf->set_reply(); mf->set_status(IN_PARTITION_L2_TO_ICNT_QUEUE, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); From 55419d7098a433122bf4d940cf38af17e33f045a Mon Sep 17 00:00:00 2001 From: Aaron Barnes <42706182+barnes88@users.noreply.github.com> Date: Fri, 12 Jul 2024 15:35:38 -0600 Subject: [PATCH 144/154] Reg bank patch (#41) * remove implicit casting, cleanup unused bank_warp_shift parameter * update cu init function prototype * remove m_bank_warp_shift from function call --- src/gpgpu-sim/shader.cc | 22 +++++++++------------- src/gpgpu-sim/shader.h | 17 +++++++---------- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index b1609d5b8..855aa1c14 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -4138,10 +4138,7 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { // for( unsigned n=0; nget_config()->warp_size; - m_bank_warp_shift = (unsigned)(int)(log(m_warp_size + 0.5) / log(2.0)); - assert((m_bank_warp_shift == 5) || (m_warp_size != 32)); sub_core_model = shader->get_config()->sub_core_model; 
m_num_warp_scheds = shader->get_config()->gpgpu_num_sched_per_core; @@ -4159,7 +4156,7 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { unsigned cusPerSched = m_cu.size() / m_num_warp_scheds; reg_id = j / cusPerSched; } - m_cu[j]->init(j, num_banks, m_bank_warp_shift, shader->get_config(), this, + m_cu[j]->init(j, num_banks, shader->get_config(), this, sub_core_model, reg_id, m_num_banks_per_sched); } for (unsigned j = 0; j < m_dispatch_units.size(); j++) { @@ -4168,11 +4165,11 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { m_initialized = true; } -int register_bank(int regnum, int wid, unsigned num_banks, - unsigned bank_warp_shift, bool sub_core_model, +unsigned register_bank(int regnum, int wid, unsigned num_banks, + bool sub_core_model, unsigned banks_per_sched, unsigned sched_id) { int bank = regnum; - if (bank_warp_shift) bank += wid; + bank += wid; if (sub_core_model) { unsigned bank_num = (bank % banks_per_sched) + (sched_id * banks_per_sched); assert(bank_num < num_banks); @@ -4190,12 +4187,12 @@ bool opndcoll_rfu_t::writeback(warp_inst_t &inst) { // in function_info::ptx_decode_inst if (reg_num >= 0) { // valid register unsigned bank = register_bank(reg_num, inst.warp_id(), m_num_banks, - m_bank_warp_shift, sub_core_model, + sub_core_model, m_num_banks_per_sched, inst.get_schd_id()); if (m_arbiter.bank_idle(bank)) { m_arbiter.allocate_bank_for_write( bank, - op_t(&inst, reg_num, m_num_banks, m_bank_warp_shift, sub_core_model, + op_t(&inst, reg_num, m_num_banks, sub_core_model, m_num_banks_per_sched, inst.get_schd_id())); inst.arch_reg.dst[op] = -1; } else { @@ -4305,7 +4302,7 @@ void opndcoll_rfu_t::allocate_reads() { unsigned reg = rr.get_reg(); unsigned wid = rr.get_wid(); unsigned bank = - register_bank(reg, wid, m_num_banks, m_bank_warp_shift, sub_core_model, + register_bank(reg, wid, m_num_banks, sub_core_model, m_num_banks_per_sched, rr.get_sid()); m_arbiter.allocate_for_read(bank, rr); 
read_ops[bank] = rr; @@ -4357,7 +4354,7 @@ void opndcoll_rfu_t::collector_unit_t::dump( } void opndcoll_rfu_t::collector_unit_t::init( - unsigned n, unsigned num_banks, unsigned log2_warp_size, + unsigned n, unsigned num_banks, const core_config *config, opndcoll_rfu_t *rfu, bool sub_core_model, unsigned reg_id, unsigned banks_per_sched) { m_rfu = rfu; @@ -4365,7 +4362,6 @@ void opndcoll_rfu_t::collector_unit_t::init( m_num_banks = num_banks; assert(m_warp == NULL); m_warp = new warp_inst_t(config); - m_bank_warp_shift = log2_warp_size; m_sub_core_model = sub_core_model; m_reg_id = reg_id; m_num_banks_per_sched = banks_per_sched; @@ -4393,7 +4389,7 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, } if (reg_num >= 0 && new_reg) { // valid register prev_regs.push_back(reg_num); - m_src_op[op] = op_t(this, op, reg_num, m_num_banks, m_bank_warp_shift, + m_src_op[op] = op_t(this, op, reg_num, m_num_banks, m_sub_core_model, m_num_banks_per_sched, (*pipeline_reg)->get_schd_id()); m_not_ready.set(op); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 089730267..95e142e13 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -336,8 +336,8 @@ inline unsigned wid_from_hw_tid(unsigned tid, unsigned warp_size) { const unsigned WARP_PER_CTA_MAX = 64; typedef std::bitset warp_set_t; -int register_bank(int regnum, int wid, unsigned num_banks, - unsigned bank_warp_shift, bool sub_core_model, +unsigned register_bank(int regnum, int wid, unsigned num_banks, + bool sub_core_model, unsigned banks_per_sched, unsigned sched_id); class shader_core_ctx; @@ -681,7 +681,7 @@ class opndcoll_rfu_t { // operand collector based register file unit public: op_t() { m_valid = false; } op_t(collector_unit_t *cu, unsigned op, unsigned reg, unsigned num_banks, - unsigned bank_warp_shift, bool sub_core_model, + bool sub_core_model, unsigned banks_per_sched, unsigned sched_id) { m_valid = true; m_warp = NULL; @@ -689,11 +689,11 @@ class 
opndcoll_rfu_t { // operand collector based register file unit m_operand = op; m_register = reg; m_shced_id = sched_id; - m_bank = register_bank(reg, cu->get_warp_id(), num_banks, bank_warp_shift, + m_bank = register_bank(reg, cu->get_warp_id(), num_banks, sub_core_model, banks_per_sched, sched_id); } op_t(const warp_inst_t *warp, unsigned reg, unsigned num_banks, - unsigned bank_warp_shift, bool sub_core_model, + bool sub_core_model, unsigned banks_per_sched, unsigned sched_id) { m_valid = true; m_warp = warp; @@ -701,7 +701,7 @@ class opndcoll_rfu_t { // operand collector based register file unit m_cu = NULL; m_operand = -1; m_shced_id = sched_id; - m_bank = register_bank(reg, warp->warp_id(), num_banks, bank_warp_shift, + m_bank = register_bank(reg, warp->warp_id(), num_banks, sub_core_model, banks_per_sched, sched_id); } @@ -934,7 +934,6 @@ class opndcoll_rfu_t { // operand collector based register file unit m_not_ready.reset(); m_warp_id = -1; m_num_banks = 0; - m_bank_warp_shift = 0; } // accessors bool ready() const; @@ -951,7 +950,7 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned get_reg_id() const { return m_reg_id; } // modifiers - void init(unsigned n, unsigned num_banks, unsigned log2_warp_size, + void init(unsigned n, unsigned num_banks, const core_config *config, opndcoll_rfu_t *rfu, bool m_sub_core_model, unsigned reg_id, unsigned num_banks_per_sched); @@ -973,7 +972,6 @@ class opndcoll_rfu_t { // operand collector based register file unit op_t *m_src_op; std::bitset m_not_ready; unsigned m_num_banks; - unsigned m_bank_warp_shift; opndcoll_rfu_t *m_rfu; unsigned m_num_banks_per_sched; @@ -1025,7 +1023,6 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned m_num_collector_sets; // unsigned m_num_collectors; unsigned m_num_banks; - unsigned m_bank_warp_shift; unsigned m_warp_size; std::vector m_cu; arbiter_t m_arbiter; From e1afc53b51d24afcfd8b8aab15e4ba5d99b4a772 Mon Sep 17 00:00:00 2001 
From: Aaron Barnes <42706182+barnes88@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:33:16 -0600 Subject: [PATCH 145/154] Auto clang format (#74) * add automated clang formatter * Automated clang-format * use /bin/bash and add print * use default checkout ref * Format only after tests are success * Run CI on merge group --------- Co-authored-by: barnes88 Co-authored-by: JRPAN <25518778+JRPan@users.noreply.github.com> --- .github/workflows/cmake.yml | 3 + .github/workflows/main.yml | 26 + format-code.sh | 1 + libcuda/cuda_runtime_api.cc | 16 +- src/abstract_hardware_model.cc | 20 +- src/abstract_hardware_model.h | 81 +-- src/accelwattch/XML_Parse.cc | 130 ++-- src/accelwattch/XML_Parse.h | 40 +- src/accelwattch/gpgpu_sim_wrapper.cc | 927 +++++++++++++++------------ src/accelwattch/gpgpu_sim_wrapper.h | 69 +- src/accelwattch/processor.cc | 4 +- src/accelwattch/xmlParser.cc | 28 +- src/cuda-sim/cuda-sim.cc | 251 ++++---- src/cuda-sim/instructions.cc | 15 +- src/cuda-sim/ptx_ir.cc | 15 +- src/cuda-sim/ptx_ir.h | 2 +- src/cuda-sim/ptx_sim.cc | 3 +- src/debug.cc | 4 +- src/gpgpu-sim/dram.cc | 104 +-- src/gpgpu-sim/dram.h | 9 +- src/gpgpu-sim/gpu-cache.cc | 34 +- src/gpgpu-sim/gpu-cache.h | 40 +- src/gpgpu-sim/gpu-sim.cc | 216 ++++--- src/gpgpu-sim/gpu-sim.h | 17 +- src/gpgpu-sim/l2cache.cc | 14 +- src/gpgpu-sim/l2cache.h | 9 +- src/gpgpu-sim/local_interconnect.cc | 2 +- src/gpgpu-sim/power_interface.cc | 695 ++++++++++---------- src/gpgpu-sim/power_interface.h | 33 +- src/gpgpu-sim/power_stat.cc | 525 +++++++++------ src/gpgpu-sim/power_stat.h | 869 ++++++++++++------------- src/gpgpu-sim/shader.cc | 331 +++++----- src/gpgpu-sim/shader.h | 470 +++++++------- 33 files changed, 2694 insertions(+), 2309 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index ab9bfd019..c0a22ebf2 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -5,6 +5,9 @@ name: Short-Tests-CMake on: # Triggers the workflow on push or pull 
request events but only for the mydev branch push: + branches-ignore: + - "gh-readonly-queue**" + merge_group: pull_request: # Allows you to run this workflow manually from the Actions tab diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c639ff3fb..39f65c94c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -6,6 +6,9 @@ name: Short-Tests on: # Triggers the workflow on push or pull request events but only for the mydev branch push: + branches-ignore: + - "gh-readonly-queue**" + merge_group: pull_request: # Allows you to run this workflow manually from the Actions tab @@ -82,3 +85,26 @@ jobs: - uses: actions/checkout@v4 - name: Run Simulation run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh + format-code: + runs-on: ubuntu-latest + needs: [build-TITANV, build-TITANV-LOCALXBAR, build-QV100, build-2060, build-3070] + permissions: + # Give the default GITHUB_TOKEN write permission to commit and push the + # added or changed files to the repository. + contents: write + steps: + - uses: actions/checkout@v4 + # Other steps that change files in the repository go here + # … + - name: Run clang-format + run: | + sudo apt-get install -y clang-format + /bin/bash ./format-code.sh + - uses: stefanzweifel/git-auto-commit-action@v5 + with: + # Optional. Commit message for the created commit. + # Defaults to "Apply automatic changes" + commit_message: Automated clang-format + # Optional. Option used by `git-status` to determine if the repository is + # dirty. 
See https://git-scm.com/docs/git-status#_options + status_options: '--untracked-files=no' \ No newline at end of file diff --git a/format-code.sh b/format-code.sh index ac753f059..acd33ab1c 100755 --- a/format-code.sh +++ b/format-code.sh @@ -1,5 +1,6 @@ # This bash script formats GPGPU-Sim using clang-format THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )" +echo "Running clang-format on $THIS_DIR" clang-format -i ${THIS_DIR}/libcuda/*.h clang-format -i ${THIS_DIR}/libcuda/*.cc clang-format -i ${THIS_DIR}/src/*.h diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc index 5866b362e..b64c3d9e2 100644 --- a/libcuda/cuda_runtime_api.cc +++ b/libcuda/cuda_runtime_api.cc @@ -133,16 +133,16 @@ #if (CUDART_VERSION < 8000) #include "__cudaFatFormat.h" #endif -#include "gpgpu_context.h" -#include "cuda_api_object.h" -#include "../src/gpgpu-sim/gpu-sim.h" -#include "../src/cuda-sim/ptx_loader.h" +#include "../src/abstract_hardware_model.h" #include "../src/cuda-sim/cuda-sim.h" #include "../src/cuda-sim/ptx_ir.h" +#include "../src/cuda-sim/ptx_loader.h" #include "../src/cuda-sim/ptx_parser.h" +#include "../src/gpgpu-sim/gpu-sim.h" #include "../src/gpgpusim_entrypoint.h" #include "../src/stream_manager.h" -#include "../src/abstract_hardware_model.h" +#include "cuda_api_object.h" +#include "gpgpu_context.h" #include #include @@ -464,7 +464,7 @@ static int get_app_cuda_version() { " | grep libcudart.so | sed 's/.*libcudart.so.\\(.*\\) =>.*/\\1/' > " + fname; int res = system(app_cuda_version_command.c_str()); - if(res == -1){ + if (res == -1) { printf("Error - Cannot detect the app's CUDA version.\n"); exit(1); } @@ -3239,8 +3239,8 @@ char *readfile(const std::string filename) { // allocate and copy the entire ptx char *ret = (char *)malloc((filesize + 1) * sizeof(char)); int num = fread(ret, 1, filesize, fp); - if(num == 0){ - std::cout << "ERROR: Could not read data from file %s\n" + if (num == 0) { + std::cout << "ERROR: Could not read data from file 
%s\n" << filename << std::endl; assert(0); } diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index ed7347de7..fd056c6d1 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, Vijay Kandiah, Nikos Hardavellas, -// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Timothy Rogers, +// Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, Timothy G. +// Rogers The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -28,7 +29,6 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
- #include "abstract_hardware_model.h" #include #include @@ -284,7 +284,7 @@ void warp_inst_t::broadcast_barrier_reduction( void warp_inst_t::generate_mem_accesses() { if (empty() || op == MEMORY_BARRIER_OP || m_mem_accesses_created) return; if (!((op == LOAD_OP) || (op == TENSOR_CORE_LOAD_OP) || (op == STORE_OP) || - (op == TENSOR_CORE_STORE_OP) )) + (op == TENSOR_CORE_STORE_OP))) return; if (m_warp_active_mask.count() == 0) return; // predicated off @@ -292,8 +292,8 @@ void warp_inst_t::generate_mem_accesses() { assert(is_load() || is_store()); - //if((space.get_type() != tex_space) && (space.get_type() != const_space)) - assert(m_per_scalar_thread_valid); // need address information per thread + // if((space.get_type() != tex_space) && (space.get_type() != const_space)) + assert(m_per_scalar_thread_valid); // need address information per thread bool is_write = is_store(); diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index ebf6535ea..e5f3b7859 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas, -// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University +// Copyright (c) 2009-2021, Tor M. Aamodt, Inderpreet Singh, Vijay Kandiah, +// Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The +// University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. 
Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -64,24 +65,24 @@ enum _memory_space_t { #ifndef COEFF_STRUCT #define COEFF_STRUCT -struct PowerscalingCoefficients{ - double int_coeff; - double int_mul_coeff; - double int_mul24_coeff; - double int_mul32_coeff; - double int_div_coeff; - double fp_coeff; - double dp_coeff; - double fp_mul_coeff; - double fp_div_coeff; - double dp_mul_coeff; - double dp_div_coeff; - double sqrt_coeff; - double log_coeff; - double sin_coeff; - double exp_coeff; - double tensor_coeff; - double tex_coeff; +struct PowerscalingCoefficients { + double int_coeff; + double int_mul_coeff; + double int_mul24_coeff; + double int_mul32_coeff; + double int_div_coeff; + double fp_coeff; + double dp_coeff; + double fp_mul_coeff; + double fp_div_coeff; + double dp_mul_coeff; + double dp_div_coeff; + double sqrt_coeff; + double log_coeff; + double sin_coeff; + double exp_coeff; + double tensor_coeff; + double tex_coeff; }; #endif @@ -974,18 +975,22 @@ class inst_t { memory_op == memory_store); } - bool is_fp() const { return ((sp_op == FP__OP));} //VIJAY - bool is_fpdiv() const { return ((sp_op == FP_DIV_OP));} - bool is_fpmul() const { return ((sp_op == FP_MUL_OP));} - bool is_dp() const { return ((sp_op == DP___OP));} - bool is_dpdiv() const { return ((sp_op == DP_DIV_OP));} - bool is_dpmul() const { return ((sp_op == DP_MUL_OP));} - bool is_imul() const { return ((sp_op == INT_MUL_OP));} - bool is_imul24() const { return ((sp_op == INT_MUL24_OP));} - bool is_imul32() 
const { return ((sp_op == INT_MUL32_OP));} - bool is_idiv() const { return ((sp_op == INT_DIV_OP));} - bool is_sfu() const {return ((sp_op == FP_SQRT_OP) || (sp_op == FP_LG_OP) || (sp_op == FP_SIN_OP) || (sp_op == FP_EXP_OP) || (sp_op == TENSOR__OP));} - bool is_alu() const {return (sp_op == INT__OP);} + bool is_fp() const { return ((sp_op == FP__OP)); } // VIJAY + bool is_fpdiv() const { return ((sp_op == FP_DIV_OP)); } + bool is_fpmul() const { return ((sp_op == FP_MUL_OP)); } + bool is_dp() const { return ((sp_op == DP___OP)); } + bool is_dpdiv() const { return ((sp_op == DP_DIV_OP)); } + bool is_dpmul() const { return ((sp_op == DP_MUL_OP)); } + bool is_imul() const { return ((sp_op == INT_MUL_OP)); } + bool is_imul24() const { return ((sp_op == INT_MUL24_OP)); } + bool is_imul32() const { return ((sp_op == INT_MUL32_OP)); } + bool is_idiv() const { return ((sp_op == INT_DIV_OP)); } + bool is_sfu() const { + return ((sp_op == FP_SQRT_OP) || (sp_op == FP_LG_OP) || + (sp_op == FP_SIN_OP) || (sp_op == FP_EXP_OP) || + (sp_op == TENSOR__OP)); + } + bool is_alu() const { return (sp_op == INT__OP); } unsigned get_num_operands() const { return num_operands; } unsigned get_num_regs() const { return num_regs; } @@ -1010,7 +1015,7 @@ class inst_t { operation_pipeline op_pipe; // code (uarch visible) identify the pipeline of // the operation (SP, SFU or MEM) mem_operation mem_op; // code (uarch visible) identify memory type - bool const_cache_operand; // has a load from constant memory as an operand + bool const_cache_operand; // has a load from constant memory as an operand _memory_op_t memory_op; // memory_op used by ptxplus unsigned num_operands; unsigned num_regs; // count vector operand as one register operand @@ -1057,7 +1062,7 @@ class warp_inst_t : public inst_t { m_empty = true; m_config = NULL; - // Ni: + // Ni: m_is_ldgsts = false; m_is_ldgdepbar = false; m_is_depbar = false; @@ -1077,7 +1082,7 @@ class warp_inst_t : public inst_t { m_is_cdp = 0; 
should_do_atomic = true; - // Ni: + // Ni: m_is_ldgsts = false; m_is_ldgdepbar = false; m_is_depbar = false; diff --git a/src/accelwattch/XML_Parse.cc b/src/accelwattch/XML_Parse.cc index eaec74806..801de6ff9 100644 --- a/src/accelwattch/XML_Parse.cc +++ b/src/accelwattch/XML_Parse.cc @@ -30,14 +30,13 @@ ***************************************************************************/ /******************************************************************** * Modified by: - * Jingwen Leng, University of Texas, Austin - * Syed Gilani, University of Wisconsin–Madison + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison * Tayler Hetherington, University of British Columbia * Ahmed ElTantawy, University of British Columbia * Vijay Kandiah, Northwestern University ********************************************************************/ - #include "XML_Parse.h" #include #include @@ -45,14 +44,18 @@ using namespace std; -const char * perf_count_label[] = { - "TOT_INST,", "FP_INT,", "IC_H,", "IC_M,", "DC_RH,", "DC_RM,", "DC_WH,", "DC_WM,", - "TC_H,", "TC_M,", "CC_H,", "CC_M,", "SHRD_ACC,", "REG_RD,", "REG_WR,", "NON_REG_OPs,", - "INT_ACC,", "FPU_ACC,", "DPU_ACC,", "INT_MUL24_ACC,", "INT_MUL32_ACC,", "INT_MUL_ACC,","INT_DIV_ACC,", - "FP_MUL_ACC,", "FP_DIV_ACC,", "FP_SQRT_ACC,", "FP_LG_ACC,", "FP_SIN_ACC,", "FP_EXP_ACC,", "DP_MUL_ACC,", - "DP_DIV_ACC,", "TENSOR_ACC,", "TEX_ACC,", "MEM_RD,","MEM_WR,", "MEM_PRE,", "L2_RH,", "L2_RM,", "L2_WH,", - "L2_WM,", "NOC_A,", "PIPE_A,", "IDLE_CORE_N,", "constant_power"}; - +const char* perf_count_label[] = { + "TOT_INST,", "FP_INT,", "IC_H,", "IC_M,", + "DC_RH,", "DC_RM,", "DC_WH,", "DC_WM,", + "TC_H,", "TC_M,", "CC_H,", "CC_M,", + "SHRD_ACC,", "REG_RD,", "REG_WR,", "NON_REG_OPs,", + "INT_ACC,", "FPU_ACC,", "DPU_ACC,", "INT_MUL24_ACC,", + "INT_MUL32_ACC,", "INT_MUL_ACC,", "INT_DIV_ACC,", "FP_MUL_ACC,", + "FP_DIV_ACC,", "FP_SQRT_ACC,", "FP_LG_ACC,", "FP_SIN_ACC,", + "FP_EXP_ACC,", "DP_MUL_ACC,", 
"DP_DIV_ACC,", "TENSOR_ACC,", + "TEX_ACC,", "MEM_RD,", "MEM_WR,", "MEM_PRE,", + "L2_RH,", "L2_RM,", "L2_WH,", "L2_WM,", + "NOC_A,", "PIPE_A,", "IDLE_CORE_N,", "constant_power"}; void ParseXML::parse(char* filepath) { unsigned int i, j, k, m, n; @@ -171,8 +174,6 @@ void ParseXML::parse(char* filepath) { continue; } - - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "static_cat1_flane") == 0) { sys.static_cat1_flane = @@ -355,7 +356,6 @@ void ParseXML::parse(char* filepath) { continue; } - if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), "target_chip_area") == 0) { sys.target_chip_area = @@ -615,106 +615,106 @@ void ParseXML::parse(char* filepath) { atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "INT_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "INT_ACC") == 0) { sys.scaling_coefficients[INT_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "FP_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_ACC") == 0) { sys.scaling_coefficients[FP_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "DP_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "DP_ACC") == 0) { sys.scaling_coefficients[DP_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "INT_MUL24_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "INT_MUL24_ACC") == 0) { 
sys.scaling_coefficients[INT_MUL24_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "INT_MUL32_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "INT_MUL32_ACC") == 0) { sys.scaling_coefficients[INT_MUL32_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "INT_MUL_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "INT_MUL_ACC") == 0) { sys.scaling_coefficients[INT_MUL_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "INT_DIV_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "INT_DIV_ACC") == 0) { sys.scaling_coefficients[INT_DIV_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "FP_MUL_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_MUL_ACC") == 0) { sys.scaling_coefficients[FP_MUL_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "FP_DIV_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_DIV_ACC") == 0) { sys.scaling_coefficients[FP_DIV_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if 
(strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "FP_SQRT_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_SQRT_ACC") == 0) { sys.scaling_coefficients[FP_SQRT_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "FP_LG_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_LG_ACC") == 0) { sys.scaling_coefficients[FP_LG_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "FP_SIN_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_SIN_ACC") == 0) { sys.scaling_coefficients[FP_SIN_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "FP_EXP_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "FP_EXP_ACC") == 0) { sys.scaling_coefficients[FP_EXP_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "DP_MUL_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "DP_MUL_ACC") == 0) { sys.scaling_coefficients[DP_MUL_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "DP_DIV_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "DP_DIV_ACC") == 0) { sys.scaling_coefficients[DP_DIV_ACC] = - 
atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "TENSOR_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "TENSOR_ACC") == 0) { sys.scaling_coefficients[TENSOR_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } - if (strcmp(xNode2.getChildNode("param",i).getAttribute("name"), - "TEX_ACC")==0) { + if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), + "TEX_ACC") == 0) { sys.scaling_coefficients[TEX_ACC] = - atof(xNode2.getChildNode("param",i).getAttribute("value")); + atof(xNode2.getChildNode("param", i).getAttribute("value")); continue; } if (strcmp(xNode2.getChildNode("param", i).getAttribute("name"), diff --git a/src/accelwattch/XML_Parse.h b/src/accelwattch/XML_Parse.h index c82359faf..176b82f6e 100644 --- a/src/accelwattch/XML_Parse.h +++ b/src/accelwattch/XML_Parse.h @@ -30,8 +30,8 @@ ***************************************************************************/ /******************************************************************** * Modified by: - * Jingwen Leng, University of Texas, Austin - * Syed Gilani, University of Wisconsin–Madison + * Jingwen Leng, University of Texas, Austin + * Syed Gilani, University of Wisconsin–Madison * Tayler Hetherington, University of British Columbia * Ahmed ElTantawy, University of British Columbia * Vijay Kandiah, Northwestern University @@ -70,7 +70,7 @@ ToXMLStringTool tx,tx2; extern const char* perf_count_label[]; enum perf_count_t { - TOT_INST=0, + TOT_INST = 0, FP_INT, IC_H, IC_M, @@ -86,23 +86,23 @@ enum perf_count_t { REG_RD, REG_WR, NON_REG_OPs, - INT_ACC, //SPU - FP_ACC, //FPU - DP_ACC, //FPU - INT_MUL24_ACC, //SFU - INT_MUL32_ACC, //SFU - INT_MUL_ACC, //SFU - INT_DIV_ACC, //SFU - FP_MUL_ACC, //SFU - FP_DIV_ACC, //SFU - 
FP_SQRT_ACC, //SFU - FP_LG_ACC, //SFU - FP_SIN_ACC, //SFU - FP_EXP_ACC, //SFU - DP_MUL_ACC, //SFU - DP_DIV_ACC, //SFU - TENSOR_ACC, //SFU - TEX_ACC, //SFU + INT_ACC, // SPU + FP_ACC, // FPU + DP_ACC, // FPU + INT_MUL24_ACC, // SFU + INT_MUL32_ACC, // SFU + INT_MUL_ACC, // SFU + INT_DIV_ACC, // SFU + FP_MUL_ACC, // SFU + FP_DIV_ACC, // SFU + FP_SQRT_ACC, // SFU + FP_LG_ACC, // SFU + FP_SIN_ACC, // SFU + FP_EXP_ACC, // SFU + DP_MUL_ACC, // SFU + DP_DIV_ACC, // SFU + TENSOR_ACC, // SFU + TEX_ACC, // SFU MEM_RD, MEM_WR, MEM_PRE, diff --git a/src/accelwattch/gpgpu_sim_wrapper.cc b/src/accelwattch/gpgpu_sim_wrapper.cc index 67d9daa1f..4883c7c54 100644 --- a/src/accelwattch/gpgpu_sim_wrapper.cc +++ b/src/accelwattch/gpgpu_sim_wrapper.cc @@ -1,16 +1,17 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, Vijay Kandiah, Nikos Hardavellas -// The University of British Columbia, Northwestern University -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Ahmed ElTantawy, +// Vijay Kandiah, Nikos Hardavellas The University of British Columbia, +// Northwestern University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. 
Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -33,14 +34,16 @@ #define SFU_BASE_POWER 0 static const char* pwr_cmp_label[] = { - "IBP,", "ICP,", "DCP,", "TCP,", "CCP,", "SHRDP,", "RFP,", "INTP,", - "FPUP,", "DPUP,", "INT_MUL24P,", "INT_MUL32P,", "INT_MULP,", "INT_DIVP,", - "FP_MULP,", "FP_DIVP,", "FP_SQRTP,", "FP_LGP,", "FP_SINP,", "FP_EXP,", - "DP_MULP,", "DP_DIVP,", "TENSORP,", "TEXP,", "SCHEDP,", "L2CP,", "MCP,", "NOCP,", - "DRAMP,", "PIPEP,", "IDLE_COREP,", "CONSTP", "STATICP"}; + "IBP,", "ICP,", "DCP,", "TCP,", "CCP,", + "SHRDP,", "RFP,", "INTP,", "FPUP,", "DPUP,", + "INT_MUL24P,", "INT_MUL32P,", "INT_MULP,", "INT_DIVP,", "FP_MULP,", + "FP_DIVP,", "FP_SQRTP,", "FP_LGP,", "FP_SINP,", "FP_EXP,", + "DP_MULP,", "DP_DIVP,", "TENSORP,", "TEXP,", "SCHEDP,", + "L2CP,", "MCP,", "NOCP,", "DRAMP,", "PIPEP,", + "IDLE_COREP,", "CONSTP", "STATICP"}; enum pwr_cmp_t { - IBP=0, + IBP = 0, ICP, DCP, TCP, @@ -77,7 +80,8 @@ enum pwr_cmp_t { }; gpgpu_sim_wrapper::gpgpu_sim_wrapper(bool power_simulation_enabled, - char* xmlfile, int power_simulation_mode, bool dvfs_enabled) { + char* xmlfile, int power_simulation_mode, + bool dvfs_enabled) { kernel_sample_count = 0; total_sample_count = 0; @@ -142,7 +146,8 @@ bool gpgpu_sim_wrapper::sanity_check(double a, double b) { return false; } void gpgpu_sim_wrapper::init_mcpat_hw_mode(unsigned gpu_sim_cycle) { - p->sys.total_cycles = gpu_sim_cycle; //total simulated cycles for current kernel + p->sys.total_cycles = + gpu_sim_cycle; // total simulated cycles for current kernel } void gpgpu_sim_wrapper::init_mcpat( @@ -150,9 +155,9 @@ void gpgpu_sim_wrapper::init_mcpat( char* metric_trace_filename, char* steady_state_filename, bool power_sim_enabled, bool trace_enabled, bool steady_state_enabled, bool power_per_cycle_dump, double 
steady_power_deviation, - double steady_min_period, int zlevel, double init_val, - int stat_sample_freq, int power_sim_mode, bool dvfs_enabled, - unsigned clock_freq, unsigned num_shaders) { + double steady_min_period, int zlevel, double init_val, int stat_sample_freq, + int power_sim_mode, bool dvfs_enabled, unsigned clock_freq, + unsigned num_shaders) { // Write File Headers for (-metrics trace, -power trace) reset_counters(); @@ -382,10 +387,7 @@ void gpgpu_sim_wrapper::set_l2cache_power(double read_hits, double read_misses, sample_perf_counters[L2_WM] = write_misses; } -void gpgpu_sim_wrapper::set_num_cores(double num_core) { - - num_cores = num_core; -} +void gpgpu_sim_wrapper::set_num_cores(double num_core) { num_cores = num_core; } void gpgpu_sim_wrapper::set_idle_core_power(double num_idle_core) { p->sys.num_idle_cores = num_idle_core; @@ -411,29 +413,25 @@ void gpgpu_sim_wrapper::set_mem_ctrl_power(double reads, double writes, sample_perf_counters[MEM_PRE] = dram_precharge; } - void gpgpu_sim_wrapper::set_model_voltage(double model_voltage) { - modeled_chip_voltage = model_voltage; + modeled_chip_voltage = model_voltage; } - void gpgpu_sim_wrapper::set_exec_unit_power(double fpu_accesses, double ialu_accesses, double sfu_accesses) { p->sys.core[0].fpu_accesses = fpu_accesses; tot_fpu_accesses = fpu_accesses; - //Integer ALU (not present in Tesla) + // Integer ALU (not present in Tesla) p->sys.core[0].ialu_accesses = ialu_accesses; - //Sfu accesses + // Sfu accesses p->sys.core[0].mul_accesses = sfu_accesses; tot_sfu_accesses = sfu_accesses; } -PowerscalingCoefficients * gpgpu_sim_wrapper::get_scaling_coeffs() -{ - - PowerscalingCoefficients * scalingCoeffs = new PowerscalingCoefficients(); +PowerscalingCoefficients* gpgpu_sim_wrapper::get_scaling_coeffs() { + PowerscalingCoefficients* scalingCoeffs = new PowerscalingCoefficients(); scalingCoeffs->int_coeff = p->sys.scaling_coefficients[INT_ACC]; scalingCoeffs->int_mul_coeff = 
p->sys.scaling_coefficients[INT_MUL_ACC]; @@ -453,68 +451,55 @@ PowerscalingCoefficients * gpgpu_sim_wrapper::get_scaling_coeffs() scalingCoeffs->tensor_coeff = p->sys.scaling_coefficients[TENSOR_ACC]; scalingCoeffs->tex_coeff = p->sys.scaling_coefficients[TEX_ACC]; return scalingCoeffs; - } -void gpgpu_sim_wrapper::set_int_accesses(double ialu_accesses, - double imul24_accesses, - double imul32_accesses, - double imul_accesses, - double idiv_accesses) -{ - - sample_perf_counters[INT_ACC]=ialu_accesses; - sample_perf_counters[INT_MUL24_ACC]=imul24_accesses; - sample_perf_counters[INT_MUL32_ACC]=imul32_accesses; - sample_perf_counters[INT_MUL_ACC]=imul_accesses; - sample_perf_counters[INT_DIV_ACC]=idiv_accesses; +void gpgpu_sim_wrapper::set_int_accesses(double ialu_accesses, + double imul24_accesses, + double imul32_accesses, + double imul_accesses, + double idiv_accesses) { + sample_perf_counters[INT_ACC] = ialu_accesses; + sample_perf_counters[INT_MUL24_ACC] = imul24_accesses; + sample_perf_counters[INT_MUL32_ACC] = imul32_accesses; + sample_perf_counters[INT_MUL_ACC] = imul_accesses; + sample_perf_counters[INT_DIV_ACC] = idiv_accesses; } -void gpgpu_sim_wrapper::set_dp_accesses(double dpu_accesses, - double dpmul_accesses, - double dpdiv_accesses) -{ - sample_perf_counters[DP_ACC]=dpu_accesses; - sample_perf_counters[DP_MUL_ACC]=dpmul_accesses; - sample_perf_counters[DP_DIV_ACC]=dpdiv_accesses; +void gpgpu_sim_wrapper::set_dp_accesses(double dpu_accesses, + double dpmul_accesses, + double dpdiv_accesses) { + sample_perf_counters[DP_ACC] = dpu_accesses; + sample_perf_counters[DP_MUL_ACC] = dpmul_accesses; + sample_perf_counters[DP_DIV_ACC] = dpdiv_accesses; } -void gpgpu_sim_wrapper::set_fp_accesses(double fpu_accesses, - double fpmul_accesses, - double fpdiv_accesses) -{ - sample_perf_counters[FP_ACC]=fpu_accesses; - sample_perf_counters[FP_MUL_ACC]=fpmul_accesses; - sample_perf_counters[FP_DIV_ACC]=fpdiv_accesses; +void gpgpu_sim_wrapper::set_fp_accesses(double 
fpu_accesses, + double fpmul_accesses, + double fpdiv_accesses) { + sample_perf_counters[FP_ACC] = fpu_accesses; + sample_perf_counters[FP_MUL_ACC] = fpmul_accesses; + sample_perf_counters[FP_DIV_ACC] = fpdiv_accesses; } -void gpgpu_sim_wrapper::set_trans_accesses(double sqrt_accesses, - double log_accesses, - double sin_accesses, - double exp_accesses) -{ - - sample_perf_counters[FP_SQRT_ACC]=sqrt_accesses; - sample_perf_counters[FP_LG_ACC]=log_accesses; - sample_perf_counters[FP_SIN_ACC]=sin_accesses; - sample_perf_counters[FP_EXP_ACC]=exp_accesses; - +void gpgpu_sim_wrapper::set_trans_accesses(double sqrt_accesses, + double log_accesses, + double sin_accesses, + double exp_accesses) { + sample_perf_counters[FP_SQRT_ACC] = sqrt_accesses; + sample_perf_counters[FP_LG_ACC] = log_accesses; + sample_perf_counters[FP_SIN_ACC] = sin_accesses; + sample_perf_counters[FP_EXP_ACC] = exp_accesses; } -void gpgpu_sim_wrapper::set_tensor_accesses(double tensor_accesses) -{ - sample_perf_counters[TENSOR_ACC]=tensor_accesses; - +void gpgpu_sim_wrapper::set_tensor_accesses(double tensor_accesses) { + sample_perf_counters[TENSOR_ACC] = tensor_accesses; } -void gpgpu_sim_wrapper::set_tex_accesses(double tex_accesses) -{ - sample_perf_counters[TEX_ACC]=tex_accesses; - +void gpgpu_sim_wrapper::set_tex_accesses(double tex_accesses) { + sample_perf_counters[TEX_ACC] = tex_accesses; } -void gpgpu_sim_wrapper::set_avg_active_threads(float active_threads) -{ +void gpgpu_sim_wrapper::set_avg_active_threads(float active_threads) { avg_threads_per_warp = (unsigned)ceil(active_threads); avg_threads_per_warp_tot += active_threads; } @@ -536,7 +521,8 @@ void gpgpu_sim_wrapper::power_metrics_calculations() { kernel_sample_count++; // Current sample power - double sample_power = proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONSTP] + sample_cmp_pwr[STATICP]; + double sample_power = proc->rt_power.readOp.dynamic + sample_cmp_pwr[CONSTP] + + sample_cmp_pwr[STATICP]; // double sample_power; // 
for(unsigned i=0; icores[0]->get_coefficient_fpint_insts(); - effpower_coeff[FP_INT]=initpower_coeff[FP_INT] * p->sys.scaling_coefficients[FP_INT]; - - initpower_coeff[TOT_INST]=proc->cores[0]->get_coefficient_tot_insts(); - effpower_coeff[TOT_INST]=initpower_coeff[TOT_INST] * p->sys.scaling_coefficients[TOT_INST]; - - initpower_coeff[REG_RD]=proc->cores[0]->get_coefficient_regreads_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); - initpower_coeff[REG_WR]=proc->cores[0]->get_coefficient_regwrites_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); - initpower_coeff[NON_REG_OPs]=proc->cores[0]->get_coefficient_noregfileops_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); - effpower_coeff[REG_RD]=initpower_coeff[REG_RD]*p->sys.scaling_coefficients[REG_RD]; - effpower_coeff[REG_WR]=initpower_coeff[REG_WR]*p->sys.scaling_coefficients[REG_WR]; - effpower_coeff[NON_REG_OPs]=initpower_coeff[NON_REG_OPs]*p->sys.scaling_coefficients[NON_REG_OPs]; - - initpower_coeff[IC_H]=proc->cores[0]->get_coefficient_icache_hits(); - initpower_coeff[IC_M]=proc->cores[0]->get_coefficient_icache_misses(); - effpower_coeff[IC_H]=initpower_coeff[IC_H]*p->sys.scaling_coefficients[IC_H]; - effpower_coeff[IC_M]=initpower_coeff[IC_M]*p->sys.scaling_coefficients[IC_M]; - - initpower_coeff[CC_H]=(proc->cores[0]->get_coefficient_ccache_readhits()+proc->get_coefficient_readcoalescing()); - initpower_coeff[CC_M]=(proc->cores[0]->get_coefficient_ccache_readmisses()+proc->get_coefficient_readcoalescing()); - effpower_coeff[CC_H]=initpower_coeff[CC_H]*p->sys.scaling_coefficients[CC_H]; - effpower_coeff[CC_M]=initpower_coeff[CC_M]*p->sys.scaling_coefficients[CC_M]; - - initpower_coeff[TC_H]=(proc->cores[0]->get_coefficient_tcache_readhits()+proc->get_coefficient_readcoalescing()); - initpower_coeff[TC_M]=(proc->cores[0]->get_coefficient_tcache_readmisses()+proc->get_coefficient_readcoalescing()); - 
effpower_coeff[TC_H]=initpower_coeff[TC_H]*p->sys.scaling_coefficients[TC_H]; - effpower_coeff[TC_M]=initpower_coeff[TC_M]*p->sys.scaling_coefficients[TC_M]; - - initpower_coeff[SHRD_ACC]=proc->cores[0]->get_coefficient_sharedmemory_readhits(); - effpower_coeff[SHRD_ACC]=initpower_coeff[SHRD_ACC]*p->sys.scaling_coefficients[SHRD_ACC]; - - initpower_coeff[DC_RH]=(proc->cores[0]->get_coefficient_dcache_readhits() + proc->get_coefficient_readcoalescing()); - initpower_coeff[DC_RM]=(proc->cores[0]->get_coefficient_dcache_readmisses() + proc->get_coefficient_readcoalescing()); - initpower_coeff[DC_WH]=(proc->cores[0]->get_coefficient_dcache_writehits() + proc->get_coefficient_writecoalescing()); - initpower_coeff[DC_WM]=(proc->cores[0]->get_coefficient_dcache_writemisses() + proc->get_coefficient_writecoalescing()); - effpower_coeff[DC_RH]=initpower_coeff[DC_RH]*p->sys.scaling_coefficients[DC_RH]; - effpower_coeff[DC_RM]=initpower_coeff[DC_RM]*p->sys.scaling_coefficients[DC_RM]; - effpower_coeff[DC_WH]=initpower_coeff[DC_WH]*p->sys.scaling_coefficients[DC_WH]; - effpower_coeff[DC_WM]=initpower_coeff[DC_WM]*p->sys.scaling_coefficients[DC_WM]; - - initpower_coeff[L2_RH]=proc->get_coefficient_l2_read_hits(); - initpower_coeff[L2_RM]=proc->get_coefficient_l2_read_misses(); - initpower_coeff[L2_WH]=proc->get_coefficient_l2_write_hits(); - initpower_coeff[L2_WM]=proc->get_coefficient_l2_write_misses(); - effpower_coeff[L2_RH]=initpower_coeff[L2_RH]*p->sys.scaling_coefficients[L2_RH]; - effpower_coeff[L2_RM]=initpower_coeff[L2_RM]*p->sys.scaling_coefficients[L2_RM]; - effpower_coeff[L2_WH]=initpower_coeff[L2_WH]*p->sys.scaling_coefficients[L2_WH]; - effpower_coeff[L2_WM]=initpower_coeff[L2_WM]*p->sys.scaling_coefficients[L2_WM]; - - initpower_coeff[IDLE_CORE_N]=p->sys.idle_core_power * proc->cores[0]->executionTime; - effpower_coeff[IDLE_CORE_N]=initpower_coeff[IDLE_CORE_N]*p->sys.scaling_coefficients[IDLE_CORE_N]; - - 
initpower_coeff[PIPE_A]=proc->cores[0]->get_coefficient_duty_cycle(); - effpower_coeff[PIPE_A]=initpower_coeff[PIPE_A]*p->sys.scaling_coefficients[PIPE_A]; - - initpower_coeff[MEM_RD]=proc->get_coefficient_mem_reads(); - initpower_coeff[MEM_WR]=proc->get_coefficient_mem_writes(); - initpower_coeff[MEM_PRE]=proc->get_coefficient_mem_pre(); - effpower_coeff[MEM_RD]=initpower_coeff[MEM_RD]*p->sys.scaling_coefficients[MEM_RD]; - effpower_coeff[MEM_WR]=initpower_coeff[MEM_WR]*p->sys.scaling_coefficients[MEM_WR]; - effpower_coeff[MEM_PRE]=initpower_coeff[MEM_PRE]*p->sys.scaling_coefficients[MEM_PRE]; - +void gpgpu_sim_wrapper::update_coefficients() { + initpower_coeff[FP_INT] = proc->cores[0]->get_coefficient_fpint_insts(); + effpower_coeff[FP_INT] = + initpower_coeff[FP_INT] * p->sys.scaling_coefficients[FP_INT]; + + initpower_coeff[TOT_INST] = proc->cores[0]->get_coefficient_tot_insts(); + effpower_coeff[TOT_INST] = + initpower_coeff[TOT_INST] * p->sys.scaling_coefficients[TOT_INST]; + + initpower_coeff[REG_RD] = + proc->cores[0]->get_coefficient_regreads_accesses() * + (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); + initpower_coeff[REG_WR] = + proc->cores[0]->get_coefficient_regwrites_accesses() * + (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); + initpower_coeff[NON_REG_OPs] = + proc->cores[0]->get_coefficient_noregfileops_accesses() * + (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); + effpower_coeff[REG_RD] = + initpower_coeff[REG_RD] * p->sys.scaling_coefficients[REG_RD]; + effpower_coeff[REG_WR] = + initpower_coeff[REG_WR] * p->sys.scaling_coefficients[REG_WR]; + effpower_coeff[NON_REG_OPs] = + initpower_coeff[NON_REG_OPs] * p->sys.scaling_coefficients[NON_REG_OPs]; + + initpower_coeff[IC_H] = proc->cores[0]->get_coefficient_icache_hits(); + initpower_coeff[IC_M] = proc->cores[0]->get_coefficient_icache_misses(); + effpower_coeff[IC_H] = + initpower_coeff[IC_H] * 
p->sys.scaling_coefficients[IC_H]; + effpower_coeff[IC_M] = + initpower_coeff[IC_M] * p->sys.scaling_coefficients[IC_M]; + + initpower_coeff[CC_H] = (proc->cores[0]->get_coefficient_ccache_readhits() + + proc->get_coefficient_readcoalescing()); + initpower_coeff[CC_M] = (proc->cores[0]->get_coefficient_ccache_readmisses() + + proc->get_coefficient_readcoalescing()); + effpower_coeff[CC_H] = + initpower_coeff[CC_H] * p->sys.scaling_coefficients[CC_H]; + effpower_coeff[CC_M] = + initpower_coeff[CC_M] * p->sys.scaling_coefficients[CC_M]; + + initpower_coeff[TC_H] = (proc->cores[0]->get_coefficient_tcache_readhits() + + proc->get_coefficient_readcoalescing()); + initpower_coeff[TC_M] = (proc->cores[0]->get_coefficient_tcache_readmisses() + + proc->get_coefficient_readcoalescing()); + effpower_coeff[TC_H] = + initpower_coeff[TC_H] * p->sys.scaling_coefficients[TC_H]; + effpower_coeff[TC_M] = + initpower_coeff[TC_M] * p->sys.scaling_coefficients[TC_M]; + + initpower_coeff[SHRD_ACC] = + proc->cores[0]->get_coefficient_sharedmemory_readhits(); + effpower_coeff[SHRD_ACC] = + initpower_coeff[SHRD_ACC] * p->sys.scaling_coefficients[SHRD_ACC]; + + initpower_coeff[DC_RH] = (proc->cores[0]->get_coefficient_dcache_readhits() + + proc->get_coefficient_readcoalescing()); + initpower_coeff[DC_RM] = + (proc->cores[0]->get_coefficient_dcache_readmisses() + + proc->get_coefficient_readcoalescing()); + initpower_coeff[DC_WH] = (proc->cores[0]->get_coefficient_dcache_writehits() + + proc->get_coefficient_writecoalescing()); + initpower_coeff[DC_WM] = + (proc->cores[0]->get_coefficient_dcache_writemisses() + + proc->get_coefficient_writecoalescing()); + effpower_coeff[DC_RH] = + initpower_coeff[DC_RH] * p->sys.scaling_coefficients[DC_RH]; + effpower_coeff[DC_RM] = + initpower_coeff[DC_RM] * p->sys.scaling_coefficients[DC_RM]; + effpower_coeff[DC_WH] = + initpower_coeff[DC_WH] * p->sys.scaling_coefficients[DC_WH]; + effpower_coeff[DC_WM] = + initpower_coeff[DC_WM] * 
p->sys.scaling_coefficients[DC_WM]; + + initpower_coeff[L2_RH] = proc->get_coefficient_l2_read_hits(); + initpower_coeff[L2_RM] = proc->get_coefficient_l2_read_misses(); + initpower_coeff[L2_WH] = proc->get_coefficient_l2_write_hits(); + initpower_coeff[L2_WM] = proc->get_coefficient_l2_write_misses(); + effpower_coeff[L2_RH] = + initpower_coeff[L2_RH] * p->sys.scaling_coefficients[L2_RH]; + effpower_coeff[L2_RM] = + initpower_coeff[L2_RM] * p->sys.scaling_coefficients[L2_RM]; + effpower_coeff[L2_WH] = + initpower_coeff[L2_WH] * p->sys.scaling_coefficients[L2_WH]; + effpower_coeff[L2_WM] = + initpower_coeff[L2_WM] * p->sys.scaling_coefficients[L2_WM]; + + initpower_coeff[IDLE_CORE_N] = + p->sys.idle_core_power * proc->cores[0]->executionTime; + effpower_coeff[IDLE_CORE_N] = + initpower_coeff[IDLE_CORE_N] * p->sys.scaling_coefficients[IDLE_CORE_N]; + + initpower_coeff[PIPE_A] = proc->cores[0]->get_coefficient_duty_cycle(); + effpower_coeff[PIPE_A] = + initpower_coeff[PIPE_A] * p->sys.scaling_coefficients[PIPE_A]; + + initpower_coeff[MEM_RD] = proc->get_coefficient_mem_reads(); + initpower_coeff[MEM_WR] = proc->get_coefficient_mem_writes(); + initpower_coeff[MEM_PRE] = proc->get_coefficient_mem_pre(); + effpower_coeff[MEM_RD] = + initpower_coeff[MEM_RD] * p->sys.scaling_coefficients[MEM_RD]; + effpower_coeff[MEM_WR] = + initpower_coeff[MEM_WR] * p->sys.scaling_coefficients[MEM_WR]; + effpower_coeff[MEM_PRE] = + initpower_coeff[MEM_PRE] * p->sys.scaling_coefficients[MEM_PRE]; + double fp_coeff = proc->cores[0]->get_coefficient_fpu_accesses(); double sfu_coeff = proc->cores[0]->get_coefficient_sfu_accesses(); - initpower_coeff[INT_ACC]= proc->cores[0]->get_coefficient_ialu_accesses()*(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + initpower_coeff[INT_ACC] = + proc->cores[0]->get_coefficient_ialu_accesses() * + (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); + + if (tot_fpu_accesses != 0) { + initpower_coeff[FP_ACC] = + 
fp_coeff * sample_perf_counters[FP_ACC] / tot_fpu_accesses; + initpower_coeff[DP_ACC] = + fp_coeff * sample_perf_counters[DP_ACC] / tot_fpu_accesses; + } else { + initpower_coeff[FP_ACC] = 0; + initpower_coeff[DP_ACC] = 0; + } - if(tot_fpu_accesses != 0){ - initpower_coeff[FP_ACC]= fp_coeff * sample_perf_counters[FP_ACC]/tot_fpu_accesses; - initpower_coeff[DP_ACC]= fp_coeff * sample_perf_counters[DP_ACC]/tot_fpu_accesses; + if (tot_sfu_accesses != 0) { + initpower_coeff[INT_MUL24_ACC] = + sfu_coeff * sample_perf_counters[INT_MUL24_ACC] / tot_sfu_accesses; + initpower_coeff[INT_MUL32_ACC] = + sfu_coeff * sample_perf_counters[INT_MUL32_ACC] / tot_sfu_accesses; + initpower_coeff[INT_MUL_ACC] = + sfu_coeff * sample_perf_counters[INT_MUL_ACC] / tot_sfu_accesses; + initpower_coeff[INT_DIV_ACC] = + sfu_coeff * sample_perf_counters[INT_DIV_ACC] / tot_sfu_accesses; + initpower_coeff[DP_MUL_ACC] = + sfu_coeff * sample_perf_counters[DP_MUL_ACC] / tot_sfu_accesses; + initpower_coeff[DP_DIV_ACC] = + sfu_coeff * sample_perf_counters[DP_DIV_ACC] / tot_sfu_accesses; + initpower_coeff[FP_MUL_ACC] = + sfu_coeff * sample_perf_counters[FP_MUL_ACC] / tot_sfu_accesses; + initpower_coeff[FP_DIV_ACC] = + sfu_coeff * sample_perf_counters[FP_DIV_ACC] / tot_sfu_accesses; + initpower_coeff[FP_SQRT_ACC] = + sfu_coeff * sample_perf_counters[FP_SQRT_ACC] / tot_sfu_accesses; + initpower_coeff[FP_LG_ACC] = + sfu_coeff * sample_perf_counters[FP_LG_ACC] / tot_sfu_accesses; + initpower_coeff[FP_SIN_ACC] = + sfu_coeff * sample_perf_counters[FP_SIN_ACC] / tot_sfu_accesses; + initpower_coeff[FP_EXP_ACC] = + sfu_coeff * sample_perf_counters[FP_EXP_ACC] / tot_sfu_accesses; + initpower_coeff[TENSOR_ACC] = + sfu_coeff * sample_perf_counters[TENSOR_ACC] / tot_sfu_accesses; + initpower_coeff[TEX_ACC] = + sfu_coeff * sample_perf_counters[TEX_ACC] / tot_sfu_accesses; + } else { + initpower_coeff[INT_MUL24_ACC] = 0; + initpower_coeff[INT_MUL32_ACC] = 0; + initpower_coeff[INT_MUL_ACC] = 0; + 
initpower_coeff[INT_DIV_ACC] = 0; + initpower_coeff[DP_MUL_ACC] = 0; + initpower_coeff[DP_DIV_ACC] = 0; + initpower_coeff[FP_MUL_ACC] = 0; + initpower_coeff[FP_DIV_ACC] = 0; + initpower_coeff[FP_SQRT_ACC] = 0; + initpower_coeff[FP_LG_ACC] = 0; + initpower_coeff[FP_SIN_ACC] = 0; + initpower_coeff[FP_EXP_ACC] = 0; + initpower_coeff[TENSOR_ACC] = 0; + initpower_coeff[TEX_ACC] = 0; } - else{ - initpower_coeff[FP_ACC]= 0; - initpower_coeff[DP_ACC]= 0; + + effpower_coeff[INT_ACC] = initpower_coeff[INT_ACC]; + effpower_coeff[FP_ACC] = initpower_coeff[FP_ACC]; + effpower_coeff[DP_ACC] = initpower_coeff[DP_ACC]; + effpower_coeff[INT_MUL24_ACC] = initpower_coeff[INT_MUL24_ACC]; + effpower_coeff[INT_MUL32_ACC] = initpower_coeff[INT_MUL32_ACC]; + effpower_coeff[INT_MUL_ACC] = initpower_coeff[INT_MUL_ACC]; + effpower_coeff[INT_DIV_ACC] = initpower_coeff[INT_DIV_ACC]; + effpower_coeff[DP_MUL_ACC] = initpower_coeff[DP_MUL_ACC]; + effpower_coeff[DP_DIV_ACC] = initpower_coeff[DP_DIV_ACC]; + effpower_coeff[FP_MUL_ACC] = initpower_coeff[FP_MUL_ACC]; + effpower_coeff[FP_DIV_ACC] = initpower_coeff[FP_DIV_ACC]; + effpower_coeff[FP_SQRT_ACC] = initpower_coeff[FP_SQRT_ACC]; + effpower_coeff[FP_LG_ACC] = initpower_coeff[FP_LG_ACC]; + effpower_coeff[FP_SIN_ACC] = initpower_coeff[FP_SIN_ACC]; + effpower_coeff[FP_EXP_ACC] = initpower_coeff[FP_EXP_ACC]; + effpower_coeff[TENSOR_ACC] = initpower_coeff[TENSOR_ACC]; + effpower_coeff[TEX_ACC] = initpower_coeff[TEX_ACC]; + + initpower_coeff[NOC_A] = proc->get_coefficient_noc_accesses(); + effpower_coeff[NOC_A] = + initpower_coeff[NOC_A] * p->sys.scaling_coefficients[NOC_A]; + + // const_dynamic_power=proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); + + for (unsigned i = 0; i < num_perf_counters; i++) { + initpower_coeff[i] /= (proc->cores[0]->executionTime); + effpower_coeff[i] /= (proc->cores[0]->executionTime); } +} - if(tot_sfu_accesses != 0){ - initpower_coeff[INT_MUL24_ACC]= sfu_coeff * 
sample_perf_counters[INT_MUL24_ACC]/tot_sfu_accesses; - initpower_coeff[INT_MUL32_ACC]= sfu_coeff * sample_perf_counters[INT_MUL32_ACC]/tot_sfu_accesses; - initpower_coeff[INT_MUL_ACC]= sfu_coeff * sample_perf_counters[INT_MUL_ACC]/tot_sfu_accesses; - initpower_coeff[INT_DIV_ACC]= sfu_coeff * sample_perf_counters[INT_DIV_ACC]/tot_sfu_accesses; - initpower_coeff[DP_MUL_ACC]= sfu_coeff * sample_perf_counters[DP_MUL_ACC]/tot_sfu_accesses; - initpower_coeff[DP_DIV_ACC]= sfu_coeff * sample_perf_counters[DP_DIV_ACC]/tot_sfu_accesses; - initpower_coeff[FP_MUL_ACC]= sfu_coeff * sample_perf_counters[FP_MUL_ACC]/tot_sfu_accesses; - initpower_coeff[FP_DIV_ACC]= sfu_coeff * sample_perf_counters[FP_DIV_ACC]/tot_sfu_accesses; - initpower_coeff[FP_SQRT_ACC]= sfu_coeff * sample_perf_counters[FP_SQRT_ACC]/tot_sfu_accesses; - initpower_coeff[FP_LG_ACC]= sfu_coeff * sample_perf_counters[FP_LG_ACC]/tot_sfu_accesses; - initpower_coeff[FP_SIN_ACC]= sfu_coeff * sample_perf_counters[FP_SIN_ACC]/tot_sfu_accesses; - initpower_coeff[FP_EXP_ACC]= sfu_coeff * sample_perf_counters[FP_EXP_ACC]/tot_sfu_accesses; - initpower_coeff[TENSOR_ACC]= sfu_coeff * sample_perf_counters[TENSOR_ACC]/tot_sfu_accesses; - initpower_coeff[TEX_ACC]= sfu_coeff * sample_perf_counters[TEX_ACC]/tot_sfu_accesses; +double gpgpu_sim_wrapper::calculate_static_power() { + double int_accesses = + initpower_coeff[INT_ACC] + initpower_coeff[INT_MUL24_ACC] + + initpower_coeff[INT_MUL32_ACC] + initpower_coeff[INT_MUL_ACC] + + initpower_coeff[INT_DIV_ACC]; + double int_add_accesses = initpower_coeff[INT_ACC]; + double int_mul_accesses = + initpower_coeff[INT_MUL24_ACC] + initpower_coeff[INT_MUL32_ACC] + + initpower_coeff[INT_MUL_ACC] + initpower_coeff[INT_DIV_ACC]; + double fp_accesses = initpower_coeff[FP_ACC] + initpower_coeff[FP_MUL_ACC] + + initpower_coeff[FP_DIV_ACC]; + double dp_accesses = initpower_coeff[DP_ACC] + initpower_coeff[DP_MUL_ACC] + + initpower_coeff[DP_DIV_ACC]; + double sfu_accesses = + 
initpower_coeff[FP_SQRT_ACC] + initpower_coeff[FP_LG_ACC] + + initpower_coeff[FP_SIN_ACC] + initpower_coeff[FP_EXP_ACC]; + double tensor_accesses = initpower_coeff[TENSOR_ACC]; + double tex_accesses = initpower_coeff[TEX_ACC]; + double total_static_power = 0.0; + double base_static_power = 0.0; + double lane_static_power = 0.0; + double per_active_core = (num_cores - num_idle_cores) / num_cores; + + double l1_accesses = initpower_coeff[DC_RH] + initpower_coeff[DC_RM] + + initpower_coeff[DC_WH] + initpower_coeff[DC_WM]; + double l2_accesses = initpower_coeff[L2_RH] + initpower_coeff[L2_RM] + + initpower_coeff[L2_WH] + initpower_coeff[L2_WM]; + double shared_accesses = initpower_coeff[SHRD_ACC]; + + if (avg_threads_per_warp == + 0) { // no functional unit threads, check for memory or a 'LIGHT_SM' + if (l1_accesses != 0.0) + return (p->sys.static_l1_flane * per_active_core); + else if (shared_accesses != 0.0) + return (p->sys.static_shared_flane * per_active_core); + else if (l2_accesses != 0.0) + return (p->sys.static_l2_flane * per_active_core); + else // LIGHT_SM + return (p->sys.static_light_flane * + per_active_core); // return LIGHT_SM base static power } - else{ - initpower_coeff[INT_MUL24_ACC]= 0; - initpower_coeff[INT_MUL32_ACC]= 0; - initpower_coeff[INT_MUL_ACC]= 0; - initpower_coeff[INT_DIV_ACC]= 0; - initpower_coeff[DP_MUL_ACC]= 0; - initpower_coeff[DP_DIV_ACC]= 0; - initpower_coeff[FP_MUL_ACC]= 0; - initpower_coeff[FP_DIV_ACC]= 0; - initpower_coeff[FP_SQRT_ACC]= 0; - initpower_coeff[FP_LG_ACC]= 0; - initpower_coeff[FP_SIN_ACC]= 0; - initpower_coeff[FP_EXP_ACC]= 0; - initpower_coeff[TENSOR_ACC]= 0; - initpower_coeff[TEX_ACC]= 0; + + /* using a linear model for thread divergence */ + if ((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses != 0.0) && + (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && + (tex_accesses == 0.0)) { + /* INT_FP_DP */ + base_static_power = p->sys.static_cat3_flane; + lane_static_power = p->sys.static_cat3_addlane; } - 
effpower_coeff[INT_ACC]= initpower_coeff[INT_ACC]; - effpower_coeff[FP_ACC]= initpower_coeff[FP_ACC]; - effpower_coeff[DP_ACC]= initpower_coeff[DP_ACC]; - effpower_coeff[INT_MUL24_ACC]= initpower_coeff[INT_MUL24_ACC]; - effpower_coeff[INT_MUL32_ACC]= initpower_coeff[INT_MUL32_ACC]; - effpower_coeff[INT_MUL_ACC]= initpower_coeff[INT_MUL_ACC]; - effpower_coeff[INT_DIV_ACC]= initpower_coeff[INT_DIV_ACC]; - effpower_coeff[DP_MUL_ACC]= initpower_coeff[DP_MUL_ACC]; - effpower_coeff[DP_DIV_ACC]= initpower_coeff[DP_DIV_ACC]; - effpower_coeff[FP_MUL_ACC]= initpower_coeff[FP_MUL_ACC]; - effpower_coeff[FP_DIV_ACC]= initpower_coeff[FP_DIV_ACC]; - effpower_coeff[FP_SQRT_ACC]= initpower_coeff[FP_SQRT_ACC]; - effpower_coeff[FP_LG_ACC]= initpower_coeff[FP_LG_ACC]; - effpower_coeff[FP_SIN_ACC]= initpower_coeff[FP_SIN_ACC]; - effpower_coeff[FP_EXP_ACC]= initpower_coeff[FP_EXP_ACC]; - effpower_coeff[TENSOR_ACC]= initpower_coeff[TENSOR_ACC]; - effpower_coeff[TEX_ACC]= initpower_coeff[TEX_ACC]; - - initpower_coeff[NOC_A]=proc->get_coefficient_noc_accesses(); - effpower_coeff[NOC_A]=initpower_coeff[NOC_A]*p->sys.scaling_coefficients[NOC_A]; - - //const_dynamic_power=proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); - - for(unsigned i=0; icores[0]->executionTime); - effpower_coeff[i]/=(proc->cores[0]->executionTime); + else if ((int_accesses != 0.0) && (fp_accesses != 0.0) && + (dp_accesses == 0.0) && (sfu_accesses == 0.0) && + (tensor_accesses != 0.0) && (tex_accesses == 0.0)) { + /* INT_FP_TENSOR */ + base_static_power = p->sys.static_cat6_flane; + lane_static_power = p->sys.static_cat6_addlane; } -} -double gpgpu_sim_wrapper::calculate_static_power(){ - double int_accesses = initpower_coeff[INT_ACC] + initpower_coeff[INT_MUL24_ACC] + initpower_coeff[INT_MUL32_ACC] + initpower_coeff[INT_MUL_ACC] + initpower_coeff[INT_DIV_ACC]; - double int_add_accesses = initpower_coeff[INT_ACC]; - double int_mul_accesses = initpower_coeff[INT_MUL24_ACC] + initpower_coeff[INT_MUL32_ACC] 
+ initpower_coeff[INT_MUL_ACC] + initpower_coeff[INT_DIV_ACC]; - double fp_accesses = initpower_coeff[FP_ACC] + initpower_coeff[FP_MUL_ACC] + initpower_coeff[FP_DIV_ACC]; - double dp_accesses = initpower_coeff[DP_ACC] + initpower_coeff[DP_MUL_ACC] + initpower_coeff[DP_DIV_ACC]; - double sfu_accesses = initpower_coeff[FP_SQRT_ACC] + initpower_coeff[FP_LG_ACC] + initpower_coeff[FP_SIN_ACC] + initpower_coeff[FP_EXP_ACC]; - double tensor_accesses = initpower_coeff[TENSOR_ACC]; - double tex_accesses = initpower_coeff[TEX_ACC]; - double total_static_power = 0.0; - double base_static_power = 0.0; - double lane_static_power = 0.0; - double per_active_core = (num_cores - num_idle_cores)/num_cores; - - - double l1_accesses = initpower_coeff[DC_RH] + initpower_coeff[DC_RM] + initpower_coeff[DC_WH] + initpower_coeff[DC_WM]; - double l2_accesses = initpower_coeff[L2_RH] + initpower_coeff[L2_RM] + initpower_coeff[L2_WH] + initpower_coeff[L2_WM]; - double shared_accesses = initpower_coeff[SHRD_ACC]; - - - if(avg_threads_per_warp == 0){ //no functional unit threads, check for memory or a 'LIGHT_SM' - if(l1_accesses != 0.0) - return (p->sys.static_l1_flane*per_active_core); - else if(shared_accesses != 0.0) - return (p->sys.static_shared_flane*per_active_core); - else if(l2_accesses != 0.0) - return (p->sys.static_l2_flane*per_active_core); - else //LIGHT_SM - return (p->sys.static_light_flane*per_active_core); //return LIGHT_SM base static power - } - - /* using a linear model for thread divergence */ - if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses != 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ - /* INT_FP_DP */ - base_static_power = p->sys.static_cat3_flane; - lane_static_power = p->sys.static_cat3_addlane; - } - - else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses != 0.0) && (tex_accesses == 0.0)){ - /* INT_FP_TENSOR */ - base_static_power = 
p->sys.static_cat6_flane; - lane_static_power = p->sys.static_cat6_addlane; - } - - else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses != 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ - /* INT_FP_SFU */ - base_static_power = p->sys.static_cat4_flane; - lane_static_power = p->sys.static_cat4_addlane; - } - - else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses != 0.0)){ - /* INT_FP_TEX */ - base_static_power = p->sys.static_cat5_flane; - lane_static_power = p->sys.static_cat5_addlane; - } - - else if((int_accesses != 0.0) && (fp_accesses != 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ - /* INT_FP */ - base_static_power = p->sys.static_cat2_flane; - lane_static_power = p->sys.static_cat2_addlane; - } - - else if((int_accesses != 0.0) && (fp_accesses == 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ - /* INT */ - /* Seperating INT_ADD only and INT_MUL only from mix of INT instructions */ - if((int_add_accesses != 0.0) && (int_mul_accesses == 0.0)){ //INT_ADD - base_static_power = p->sys.static_intadd_flane; - lane_static_power = p->sys.static_intadd_addlane; - } - else if((int_add_accesses == 0.0) && (int_mul_accesses != 0.0)){ //INT_MUL - base_static_power = p->sys.static_intmul_flane; - lane_static_power = p->sys.static_intmul_addlane; - } - else{ //INT_ADD+MUL - base_static_power = p->sys.static_cat1_flane; - lane_static_power = p->sys.static_cat1_addlane; - } - } - - else if((int_accesses == 0.0) && (fp_accesses == 0.0) && (dp_accesses == 0.0) && (sfu_accesses == 0.0) && (tensor_accesses == 0.0) && (tex_accesses == 0.0)){ - /* LIGHT_SM or memory only sample */ - lane_static_power = 0.0; //addlane static power is 0 for l1/l2/shared memory only accesses - if(l1_accesses != 0.0) - 
base_static_power = p->sys.static_l1_flane; - else if(shared_accesses != 0.0) - base_static_power = p->sys.static_shared_flane; - else if(l2_accesses != 0.0) - base_static_power = p->sys.static_l2_flane; - else{ - base_static_power = p->sys.static_light_flane; - lane_static_power = p->sys.static_light_addlane; - } - } - else{ - base_static_power = p->sys.static_geomean_flane; //GEOMEAN except LIGHT_SM if we don't fall into any of the categories above - lane_static_power = p->sys.static_geomean_addlane; - } - - total_static_power = base_static_power + (((double)avg_threads_per_warp-1.0)*lane_static_power); //Linear Model - return (total_static_power*per_active_core); -} + else if ((int_accesses != 0.0) && (fp_accesses != 0.0) && + (dp_accesses == 0.0) && (sfu_accesses != 0.0) && + (tensor_accesses == 0.0) && (tex_accesses == 0.0)) { + /* INT_FP_SFU */ + base_static_power = p->sys.static_cat4_flane; + lane_static_power = p->sys.static_cat4_addlane; + } -void gpgpu_sim_wrapper::update_components_power() -{ + else if ((int_accesses != 0.0) && (fp_accesses != 0.0) && + (dp_accesses == 0.0) && (sfu_accesses == 0.0) && + (tensor_accesses == 0.0) && (tex_accesses != 0.0)) { + /* INT_FP_TEX */ + base_static_power = p->sys.static_cat5_flane; + lane_static_power = p->sys.static_cat5_addlane; + } + else if ((int_accesses != 0.0) && (fp_accesses != 0.0) && + (dp_accesses == 0.0) && (sfu_accesses == 0.0) && + (tensor_accesses == 0.0) && (tex_accesses == 0.0)) { + /* INT_FP */ + base_static_power = p->sys.static_cat2_flane; + lane_static_power = p->sys.static_cat2_addlane; + } + + else if ((int_accesses != 0.0) && (fp_accesses == 0.0) && + (dp_accesses == 0.0) && (sfu_accesses == 0.0) && + (tensor_accesses == 0.0) && (tex_accesses == 0.0)) { + /* INT */ + /* Seperating INT_ADD only and INT_MUL only from mix of INT instructions */ + if ((int_add_accesses != 0.0) && (int_mul_accesses == 0.0)) { // INT_ADD + base_static_power = p->sys.static_intadd_flane; + lane_static_power = 
p->sys.static_intadd_addlane; + } else if ((int_add_accesses == 0.0) && + (int_mul_accesses != 0.0)) { // INT_MUL + base_static_power = p->sys.static_intmul_flane; + lane_static_power = p->sys.static_intmul_addlane; + } else { // INT_ADD+MUL + base_static_power = p->sys.static_cat1_flane; + lane_static_power = p->sys.static_cat1_addlane; + } + } + + else if ((int_accesses == 0.0) && (fp_accesses == 0.0) && + (dp_accesses == 0.0) && (sfu_accesses == 0.0) && + (tensor_accesses == 0.0) && (tex_accesses == 0.0)) { + /* LIGHT_SM or memory only sample */ + lane_static_power = + 0.0; // addlane static power is 0 for l1/l2/shared memory only accesses + if (l1_accesses != 0.0) + base_static_power = p->sys.static_l1_flane; + else if (shared_accesses != 0.0) + base_static_power = p->sys.static_shared_flane; + else if (l2_accesses != 0.0) + base_static_power = p->sys.static_l2_flane; + else { + base_static_power = p->sys.static_light_flane; + lane_static_power = p->sys.static_light_addlane; + } + } else { + base_static_power = + p->sys.static_geomean_flane; // GEOMEAN except LIGHT_SM if we don't + // fall into any of the categories above + lane_static_power = p->sys.static_geomean_addlane; + } + + total_static_power = + base_static_power + (((double)avg_threads_per_warp - 1.0) * + lane_static_power); // Linear Model + return (total_static_power * per_active_core); +} + +void gpgpu_sim_wrapper::update_components_power() { update_coefficients(); - proc_power=proc->rt_power.readOp.dynamic; - sample_cmp_pwr[IBP]=(proc->cores[0]->ifu->IB->rt_power.readOp.dynamic - +proc->cores[0]->ifu->IB->rt_power.writeOp.dynamic - +proc->cores[0]->ifu->ID_misc->rt_power.readOp.dynamic - +proc->cores[0]->ifu->ID_operand->rt_power.readOp.dynamic - +proc->cores[0]->ifu->ID_inst->rt_power.readOp.dynamic)/(proc->cores[0]->executionTime); + proc_power = proc->rt_power.readOp.dynamic; + sample_cmp_pwr[IBP] = + (proc->cores[0]->ifu->IB->rt_power.readOp.dynamic + + 
proc->cores[0]->ifu->IB->rt_power.writeOp.dynamic + + proc->cores[0]->ifu->ID_misc->rt_power.readOp.dynamic + + proc->cores[0]->ifu->ID_operand->rt_power.readOp.dynamic + + proc->cores[0]->ifu->ID_inst->rt_power.readOp.dynamic) / + (proc->cores[0]->executionTime); - sample_cmp_pwr[ICP]=proc->cores[0]->ifu->icache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + sample_cmp_pwr[ICP] = proc->cores[0]->ifu->icache.rt_power.readOp.dynamic / + (proc->cores[0]->executionTime); - sample_cmp_pwr[DCP]=proc->cores[0]->lsu->dcache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + sample_cmp_pwr[DCP] = proc->cores[0]->lsu->dcache.rt_power.readOp.dynamic / + (proc->cores[0]->executionTime); - sample_cmp_pwr[TCP]=proc->cores[0]->lsu->tcache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + sample_cmp_pwr[TCP] = proc->cores[0]->lsu->tcache.rt_power.readOp.dynamic / + (proc->cores[0]->executionTime); - sample_cmp_pwr[CCP]=proc->cores[0]->lsu->ccache.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + sample_cmp_pwr[CCP] = proc->cores[0]->lsu->ccache.rt_power.readOp.dynamic / + (proc->cores[0]->executionTime); - sample_cmp_pwr[SHRDP]=proc->cores[0]->lsu->sharedmemory.rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + sample_cmp_pwr[SHRDP] = + proc->cores[0]->lsu->sharedmemory.rt_power.readOp.dynamic / + (proc->cores[0]->executionTime); - sample_cmp_pwr[RFP]=(proc->cores[0]->exu->rfu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)) - *(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + sample_cmp_pwr[RFP] = + (proc->cores[0]->exu->rfu->rt_power.readOp.dynamic / + (proc->cores[0]->executionTime)) * + (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - double sample_fp_pwr = (proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)); + double sample_fp_pwr = (proc->cores[0]->exu->fp_u->rt_power.readOp.dynamic / + (proc->cores[0]->executionTime)); - double 
sample_sfu_pwr = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)); + double sample_sfu_pwr = (proc->cores[0]->exu->mul->rt_power.readOp.dynamic / + (proc->cores[0]->executionTime)); - sample_cmp_pwr[INTP]=(proc->cores[0]->exu->exeu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime)) - *(proc->cores[0]->exu->rf_fu_clockRate/proc->cores[0]->exu->clockRate); + sample_cmp_pwr[INTP] = + (proc->cores[0]->exu->exeu->rt_power.readOp.dynamic / + (proc->cores[0]->executionTime)) * + (proc->cores[0]->exu->rf_fu_clockRate / proc->cores[0]->exu->clockRate); - - if(tot_fpu_accesses != 0){ - sample_cmp_pwr[FPUP]= sample_fp_pwr * sample_perf_counters[FP_ACC]/tot_fpu_accesses; - sample_cmp_pwr[DPUP]= sample_fp_pwr * sample_perf_counters[DP_ACC]/tot_fpu_accesses; - } - else{ - sample_cmp_pwr[FPUP]= 0; - sample_cmp_pwr[DPUP]= 0; - } - if(tot_sfu_accesses != 0){ - sample_cmp_pwr[INT_MUL24P]= sample_sfu_pwr * sample_perf_counters[INT_MUL24_ACC]/tot_sfu_accesses; - sample_cmp_pwr[INT_MUL32P]= sample_sfu_pwr * sample_perf_counters[INT_MUL32_ACC]/tot_sfu_accesses; - sample_cmp_pwr[INT_MULP]= sample_sfu_pwr * sample_perf_counters[INT_MUL_ACC]/tot_sfu_accesses; - sample_cmp_pwr[INT_DIVP]= sample_sfu_pwr * sample_perf_counters[INT_DIV_ACC]/tot_sfu_accesses; - sample_cmp_pwr[FP_MULP]= sample_sfu_pwr * sample_perf_counters[FP_MUL_ACC]/tot_sfu_accesses; - sample_cmp_pwr[FP_DIVP]= sample_sfu_pwr * sample_perf_counters[FP_DIV_ACC]/tot_sfu_accesses; - sample_cmp_pwr[FP_SQRTP]= sample_sfu_pwr * sample_perf_counters[FP_SQRT_ACC]/tot_sfu_accesses; - sample_cmp_pwr[FP_LGP]= sample_sfu_pwr * sample_perf_counters[FP_LG_ACC]/tot_sfu_accesses; - sample_cmp_pwr[FP_SINP]= sample_sfu_pwr * sample_perf_counters[FP_SIN_ACC]/tot_sfu_accesses; - sample_cmp_pwr[FP_EXP]= sample_sfu_pwr * sample_perf_counters[FP_EXP_ACC]/tot_sfu_accesses; - sample_cmp_pwr[DP_MULP]= sample_sfu_pwr * sample_perf_counters[DP_MUL_ACC]/tot_sfu_accesses; - sample_cmp_pwr[DP_DIVP]= sample_sfu_pwr * 
sample_perf_counters[DP_DIV_ACC]/tot_sfu_accesses; - sample_cmp_pwr[TENSORP]= sample_sfu_pwr * sample_perf_counters[TENSOR_ACC]/tot_sfu_accesses; - sample_cmp_pwr[TEXP]= sample_sfu_pwr * sample_perf_counters[TEX_ACC]/tot_sfu_accesses; + if (tot_fpu_accesses != 0) { + sample_cmp_pwr[FPUP] = + sample_fp_pwr * sample_perf_counters[FP_ACC] / tot_fpu_accesses; + sample_cmp_pwr[DPUP] = + sample_fp_pwr * sample_perf_counters[DP_ACC] / tot_fpu_accesses; + } else { + sample_cmp_pwr[FPUP] = 0; + sample_cmp_pwr[DPUP] = 0; } - else{ - sample_cmp_pwr[INT_MUL24P]= 0; - sample_cmp_pwr[INT_MUL32P]= 0; - sample_cmp_pwr[INT_MULP]= 0; - sample_cmp_pwr[INT_DIVP]= 0; - sample_cmp_pwr[FP_MULP]= 0; - sample_cmp_pwr[FP_DIVP]= 0; - sample_cmp_pwr[FP_SQRTP]= 0; - sample_cmp_pwr[FP_LGP]= 0; - sample_cmp_pwr[FP_SINP]= 0; - sample_cmp_pwr[FP_EXP]= 0; - sample_cmp_pwr[DP_MULP]= 0; - sample_cmp_pwr[DP_DIVP]= 0; - sample_cmp_pwr[TENSORP]= 0; - sample_cmp_pwr[TEXP]= 0; + if (tot_sfu_accesses != 0) { + sample_cmp_pwr[INT_MUL24P] = + sample_sfu_pwr * sample_perf_counters[INT_MUL24_ACC] / tot_sfu_accesses; + sample_cmp_pwr[INT_MUL32P] = + sample_sfu_pwr * sample_perf_counters[INT_MUL32_ACC] / tot_sfu_accesses; + sample_cmp_pwr[INT_MULP] = + sample_sfu_pwr * sample_perf_counters[INT_MUL_ACC] / tot_sfu_accesses; + sample_cmp_pwr[INT_DIVP] = + sample_sfu_pwr * sample_perf_counters[INT_DIV_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_MULP] = + sample_sfu_pwr * sample_perf_counters[FP_MUL_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_DIVP] = + sample_sfu_pwr * sample_perf_counters[FP_DIV_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_SQRTP] = + sample_sfu_pwr * sample_perf_counters[FP_SQRT_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_LGP] = + sample_sfu_pwr * sample_perf_counters[FP_LG_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_SINP] = + sample_sfu_pwr * sample_perf_counters[FP_SIN_ACC] / tot_sfu_accesses; + sample_cmp_pwr[FP_EXP] = + sample_sfu_pwr * sample_perf_counters[FP_EXP_ACC] / tot_sfu_accesses; + 
sample_cmp_pwr[DP_MULP] = + sample_sfu_pwr * sample_perf_counters[DP_MUL_ACC] / tot_sfu_accesses; + sample_cmp_pwr[DP_DIVP] = + sample_sfu_pwr * sample_perf_counters[DP_DIV_ACC] / tot_sfu_accesses; + sample_cmp_pwr[TENSORP] = + sample_sfu_pwr * sample_perf_counters[TENSOR_ACC] / tot_sfu_accesses; + sample_cmp_pwr[TEXP] = + sample_sfu_pwr * sample_perf_counters[TEX_ACC] / tot_sfu_accesses; + } else { + sample_cmp_pwr[INT_MUL24P] = 0; + sample_cmp_pwr[INT_MUL32P] = 0; + sample_cmp_pwr[INT_MULP] = 0; + sample_cmp_pwr[INT_DIVP] = 0; + sample_cmp_pwr[FP_MULP] = 0; + sample_cmp_pwr[FP_DIVP] = 0; + sample_cmp_pwr[FP_SQRTP] = 0; + sample_cmp_pwr[FP_LGP] = 0; + sample_cmp_pwr[FP_SINP] = 0; + sample_cmp_pwr[FP_EXP] = 0; + sample_cmp_pwr[DP_MULP] = 0; + sample_cmp_pwr[DP_DIVP] = 0; + sample_cmp_pwr[TENSORP] = 0; + sample_cmp_pwr[TEXP] = 0; } - sample_cmp_pwr[SCHEDP]=proc->cores[0]->exu->scheu->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + sample_cmp_pwr[SCHEDP] = proc->cores[0]->exu->scheu->rt_power.readOp.dynamic / + (proc->cores[0]->executionTime); - sample_cmp_pwr[L2CP]=(proc->XML->sys.number_of_L2s>0)? proc->l2array[0]->rt_power.readOp.dynamic/(proc->cores[0]->executionTime):0; + sample_cmp_pwr[L2CP] = (proc->XML->sys.number_of_L2s > 0) + ? 
proc->l2array[0]->rt_power.readOp.dynamic / + (proc->cores[0]->executionTime) + : 0; - sample_cmp_pwr[MCP]=(proc->mc->rt_power.readOp.dynamic-proc->mc->dram->rt_power.readOp.dynamic)/(proc->cores[0]->executionTime); + sample_cmp_pwr[MCP] = (proc->mc->rt_power.readOp.dynamic - + proc->mc->dram->rt_power.readOp.dynamic) / + (proc->cores[0]->executionTime); - sample_cmp_pwr[NOCP]=proc->nocs[0]->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + sample_cmp_pwr[NOCP] = + proc->nocs[0]->rt_power.readOp.dynamic / (proc->cores[0]->executionTime); - sample_cmp_pwr[DRAMP]=proc->mc->dram->rt_power.readOp.dynamic/(proc->cores[0]->executionTime); + sample_cmp_pwr[DRAMP] = + proc->mc->dram->rt_power.readOp.dynamic / (proc->cores[0]->executionTime); - sample_cmp_pwr[PIPEP]=proc->cores[0]->Pipeline_energy/(proc->cores[0]->executionTime); + sample_cmp_pwr[PIPEP] = + proc->cores[0]->Pipeline_energy / (proc->cores[0]->executionTime); - sample_cmp_pwr[IDLE_COREP]=proc->cores[0]->IdleCoreEnergy/(proc->cores[0]->executionTime); + sample_cmp_pwr[IDLE_COREP] = + proc->cores[0]->IdleCoreEnergy / (proc->cores[0]->executionTime); - // This constant dynamic power (e.g., clock power) part is estimated via regression model. - sample_cmp_pwr[CONSTP]=0; - sample_cmp_pwr[STATICP]=0; - // double cnst_dyn = proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); - // // If the regression scaling term is greater than the recorded constant dynamic power - // // then use the difference (other portion already added to dynamic power). Else, + // This constant dynamic power (e.g., clock power) part is estimated via + // regression model. + sample_cmp_pwr[CONSTP] = 0; + sample_cmp_pwr[STATICP] = 0; + // double cnst_dyn = + // proc->get_const_dynamic_power()/(proc->cores[0]->executionTime); + // // If the regression scaling term is greater than the recorded constant + // dynamic power + // // then use the difference (other portion already added to dynamic power). 
+ // Else, // // all the constant dynamic power is accounted for, add nothing. // if(p->sys.scaling_coefficients[constant_power] > cnst_dyn) - // sample_cmp_pwr[CONSTP] = (p->sys.scaling_coefficients[constant_power]-cnst_dyn); + // sample_cmp_pwr[CONSTP] = + // (p->sys.scaling_coefficients[constant_power]-cnst_dyn); sample_cmp_pwr[CONSTP] = p->sys.scaling_coefficients[constant_power]; sample_cmp_pwr[STATICP] = calculate_static_power(); - if(g_dvfs_enabled){ - double voltage_ratio = modeled_chip_voltage/p->sys.modeled_chip_voltage_ref; - sample_cmp_pwr[IDLE_COREP] *= voltage_ratio; // static power scaled by voltage_ratio - sample_cmp_pwr[STATICP] *= voltage_ratio; // static power scaled by voltage_ratio - for(unsigned i=0; isys.modeled_chip_voltage_ref; + sample_cmp_pwr[IDLE_COREP] *= + voltage_ratio; // static power scaled by voltage_ratio + sample_cmp_pwr[STATICP] *= + voltage_ratio; // static power scaled by voltage_ratio + for (unsigned i = 0; i < num_pwr_cmps; i++) { + if ((i != IDLE_COREP) && (i != STATICP)) { + sample_cmp_pwr[i] *= + voltage_ratio * + voltage_ratio; // dynamic power scaled by square of voltage_ratio + } + } } - - proc_power+=sample_cmp_pwr[CONSTP]+sample_cmp_pwr[STATICP]; - if(!g_dvfs_enabled){ // sanity check will fail when voltage scaling is applied, fix later - double sum_pwr_cmp=0; - for(unsigned i=0; ilpXML[pXML->nIndex]; @@ -2193,15 +2191,15 @@ int XMLNode::CreateXMLStringR(XMLNodeData *pEntry, XMLSTR lpszMarker, nResult++; } } else - // If there are child nodes we need to terminate the start tag - if (nElementI) { - if (lpszMarker) lpszMarker[nResult - 1] = _CXML('>'); - if (nFormat >= 0) { - if (lpszMarker) lpszMarker[nResult] = _CXML('\n'); - nResult++; - } - } else - nResult--; + // If there are child nodes we need to terminate the start tag + if (nElementI) { + if (lpszMarker) lpszMarker[nResult - 1] = _CXML('>'); + if (nFormat >= 0) { + if (lpszMarker) lpszMarker[nResult] = _CXML('\n'); + nResult++; + } + } else + nResult--; } // 
Calculate the child format for when we recurse. This is used to diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index 888cf7750..833d33f5c 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -1,18 +1,19 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, -// George L. Yuan, Jimmy Kwa, Vijay Kandiah, Nikos Hardavellas, +// George L. Yuan, Jimmy Kwa, Vijay Kandiah, Nikos Hardavellas, // Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. 
@@ -591,118 +592,118 @@ void ptx_instruction::set_fp_or_int_archop() { } } -void ptx_instruction::set_mul_div_or_other_archop(){ - sp_op=OTHER_OP; - if((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) && (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) && (m_opcode != RETP_OP) && (m_opcode != RET_OP) && (m_opcode != CALLP_OP) && (m_opcode != CALL_OP)){ - if(get_type() == F64_TYPE || get_type() == FF64_TYPE){ - switch(get_opcode()){ - case MUL_OP: - case MAD_OP: - case FMA_OP: - sp_op=DP_MUL_OP; - break; - case DIV_OP: - case REM_OP: - sp_op=DP_DIV_OP; - break; - case RCP_OP: - sp_op=DP_DIV_OP; - break; - case LG2_OP: - sp_op=FP_LG_OP; - break; - case RSQRT_OP: - case SQRT_OP: - sp_op=FP_SQRT_OP; - break; - case SIN_OP: - case COS_OP: - sp_op=FP_SIN_OP; - break; - case EX2_OP: - sp_op=FP_EXP_OP; - break; - case MMA_OP: - sp_op=TENSOR__OP; - break; - case TEX_OP: - sp_op=TEX__OP; - break; - default: - if((op==DP_OP) || (op==ALU_OP)) - sp_op=DP___OP; - break; - } +void ptx_instruction::set_mul_div_or_other_archop() { + sp_op = OTHER_OP; + if ((m_opcode != MEMBAR_OP) && (m_opcode != SSY_OP) && (m_opcode != BRA_OP) && + (m_opcode != BAR_OP) && (m_opcode != EXIT_OP) && (m_opcode != NOP_OP) && + (m_opcode != RETP_OP) && (m_opcode != RET_OP) && (m_opcode != CALLP_OP) && + (m_opcode != CALL_OP)) { + if (get_type() == F64_TYPE || get_type() == FF64_TYPE) { + switch (get_opcode()) { + case MUL_OP: + case MAD_OP: + case FMA_OP: + sp_op = DP_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op = DP_DIV_OP; + break; + case RCP_OP: + sp_op = DP_DIV_OP; + break; + case LG2_OP: + sp_op = FP_LG_OP; + break; + case RSQRT_OP: + case SQRT_OP: + sp_op = FP_SQRT_OP; + break; + case SIN_OP: + case COS_OP: + sp_op = FP_SIN_OP; + break; + case EX2_OP: + sp_op = FP_EXP_OP; + break; + case MMA_OP: + sp_op = TENSOR__OP; + break; + case TEX_OP: + sp_op = TEX__OP; + break; + default: + if ((op == DP_OP) || (op == ALU_OP)) sp_op = DP___OP; + break; } 
- else if(get_type()==F16_TYPE || get_type()==F32_TYPE){ - switch(get_opcode()){ - case MUL_OP: - case MAD_OP: - case FMA_OP: - sp_op=FP_MUL_OP; - break; - case DIV_OP: - case REM_OP: - sp_op=FP_DIV_OP; - break; - case RCP_OP: - sp_op=FP_DIV_OP; - break; - case LG2_OP: - sp_op=FP_LG_OP; - break; - case RSQRT_OP: - case SQRT_OP: - sp_op=FP_SQRT_OP; - break; - case SIN_OP: - case COS_OP: - sp_op=FP_SIN_OP; - break; - case EX2_OP: - sp_op=FP_EXP_OP; - break; - case MMA_OP: - sp_op=TENSOR__OP; - break; - case TEX_OP: - sp_op=TEX__OP; - break; - default: - if((op==SP_OP) || (op==ALU_OP)) - sp_op=FP__OP; - break; - } - }else { - switch(get_opcode()){ - case MUL24_OP: - case MAD24_OP: - sp_op=INT_MUL24_OP; - break; - case MUL_OP: - case MAD_OP: - case FMA_OP: - if(get_type()==U32_TYPE || get_type()==S32_TYPE || get_type()==B32_TYPE) - sp_op=INT_MUL32_OP; - else - sp_op=INT_MUL_OP; - break; - case DIV_OP: - case REM_OP: - sp_op=INT_DIV_OP; - break; - case MMA_OP: - sp_op=TENSOR__OP; - break; - case TEX_OP: - sp_op=TEX__OP; - break; - default: - if((op==INTP_OP) || (op==ALU_OP)) - sp_op=INT__OP; - break; - } + } else if (get_type() == F16_TYPE || get_type() == F32_TYPE) { + switch (get_opcode()) { + case MUL_OP: + case MAD_OP: + case FMA_OP: + sp_op = FP_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op = FP_DIV_OP; + break; + case RCP_OP: + sp_op = FP_DIV_OP; + break; + case LG2_OP: + sp_op = FP_LG_OP; + break; + case RSQRT_OP: + case SQRT_OP: + sp_op = FP_SQRT_OP; + break; + case SIN_OP: + case COS_OP: + sp_op = FP_SIN_OP; + break; + case EX2_OP: + sp_op = FP_EXP_OP; + break; + case MMA_OP: + sp_op = TENSOR__OP; + break; + case TEX_OP: + sp_op = TEX__OP; + break; + default: + if ((op == SP_OP) || (op == ALU_OP)) sp_op = FP__OP; + break; } + } else { + switch (get_opcode()) { + case MUL24_OP: + case MAD24_OP: + sp_op = INT_MUL24_OP; + break; + case MUL_OP: + case MAD_OP: + case FMA_OP: + if (get_type() == U32_TYPE || get_type() == S32_TYPE || + get_type() == B32_TYPE) 
+ sp_op = INT_MUL32_OP; + else + sp_op = INT_MUL_OP; + break; + case DIV_OP: + case REM_OP: + sp_op = INT_DIV_OP; + break; + case MMA_OP: + sp_op = TENSOR__OP; + break; + case TEX_OP: + sp_op = TEX__OP; + break; + default: + if ((op == INTP_OP) || (op == ALU_OP)) sp_op = INT__OP; + break; + } + } } } @@ -960,14 +961,16 @@ void ptx_instruction::set_opcode_and_latency() { break; } break; - case MUL24_OP: //MUL24 is performed on mul32 units (with additional instructions for bitmasking) on devices with compute capability >1.x - latency = int_latency[2]+1; - initiation_interval = int_init[2]+1; + case MUL24_OP: // MUL24 is performed on mul32 units (with additional + // instructions for bitmasking) on devices with compute + // capability >1.x + latency = int_latency[2] + 1; + initiation_interval = int_init[2] + 1; op = INTP_OP; break; case MAD24_OP: - latency = int_latency[3]+1; - initiation_interval = int_init[3]+1; + latency = int_latency[3] + 1; + initiation_interval = int_init[3] + 1; op = INTP_OP; break; case DIV_OP: @@ -1533,10 +1536,10 @@ void function_info::ptx_jit_config( filename_c.c_str()); assert(system(buff) != 0); FILE *fp = fopen(filename_c.c_str(), "r"); - char * ptr = fgets(buff, 1024, fp); - if(ptr == NULL ){ - printf("can't read file %s \n", filename_c.c_str()); - assert(0); + char *ptr = fgets(buff, 1024, fp); + if (ptr == NULL) { + printf("can't read file %s \n", filename_c.c_str()); + assert(0); } fclose(fp); std::string fn(buff); diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 4981c9994..4792efc80 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -1,18 +1,19 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, // Jimmy Kwa, George L. Yuan, Vijay Kandiah, Nikos Hardavellas, // Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. 
+// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. 
@@ -1948,7 +1949,7 @@ void mma_impl(const ptx_instruction *pI, core_t *core, warp_inst_t inst) { hex_val = (v[k / 2].s64 & 0xffff); else hex_val = ((v[k / 2].s64 & 0xffff0000) >> 16); - nw_v[k].f16 = *(reinterpret_cast(hex_val)); + nw_v[k].f16 = *(reinterpret_cast(hex_val)); } } if (!((operand_num == 3) && (type2 == F32_TYPE))) { @@ -3980,7 +3981,7 @@ void mad_def(const ptx_instruction *pI, ptx_thread_info *thread, fesetround(FE_TOWARDZERO); break; default: - //assert(0); + // assert(0); break; } d.f32 = a.f32 * b.f32 + c.f32; @@ -4326,7 +4327,7 @@ void mul_impl(const ptx_instruction *pI, ptx_thread_info *thread) { case S64_TYPE: t.s64 = a.s64 * b.s64; assert(!pI->is_wide()); - //assert(!pI->is_hi()); + // assert(!pI->is_hi()); d.s64 = t.s64; break; case U16_TYPE: diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index f25f1d582..d3095428f 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -1,18 +1,19 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Ali Bakhoda, Wilson W.L. Fung, // George L. Yuan, Vijay Kandiah, Nikos Hardavellas, // Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. 
Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -1386,7 +1387,7 @@ ptx_instruction::ptx_instruction( case CS_OPTION: case LU_OPTION: case CV_OPTION: - case WB_OPTION: + case WB_OPTION: case WT_OPTION: m_cache_option = last_ptx_inst_option; break; @@ -1469,8 +1470,8 @@ std::string ptx_instruction::to_string() const { char buf[STR_SIZE]; unsigned used_bytes = 0; if (!is_label()) { - used_bytes += - snprintf(buf + used_bytes, STR_SIZE - used_bytes, " PC=0x%03llx ", m_PC); + used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes, + " PC=0x%03llx ", m_PC); } else { used_bytes += snprintf(buf + used_bytes, STR_SIZE - used_bytes, " "); diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index 7ba717118..8b1f19c86 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -1248,7 +1248,7 @@ class function_info { const ptx_version &get_ptx_version() const { return m_symtab->get_ptx_version(); } - virtual ~function_info(){} + virtual ~function_info() {} unsigned get_sm_target() const { return m_symtab->get_sm_target(); } bool is_extern() const { return m_extern; } void set_name(const char *name) { m_name = name; } diff --git a/src/cuda-sim/ptx_sim.cc b/src/cuda-sim/ptx_sim.cc index 6503499fc..2a548ee36 100644 --- a/src/cuda-sim/ptx_sim.cc +++ b/src/cuda-sim/ptx_sim.cc @@ -369,7 +369,8 @@ static void print_reg(FILE *fp, std::string name, ptx_reg_t value, fprintf(fp, ".u64 %llu [0x%llx]\n", value.u64, value.u64); break; case F16_TYPE: - fprintf(fp, ".f16 %f [0x%04x]\n", static_cast(value.f16), (unsigned)value.u16); + fprintf(fp, ".f16 %f [0x%04x]\n", static_cast(value.f16), + (unsigned)value.u16); break; case F32_TYPE: fprintf(fp, ".f32 %.15lf [0x%08x]\n", value.f32, value.u32); diff --git 
a/src/debug.cc b/src/debug.cc index e23ffd46d..8cc5e1f52 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -124,7 +124,7 @@ void gpgpu_sim::gpgpu_debug() { fflush(stdout); char line[1024]; - char * ptr = fgets(line, 1024, stdin); + char *ptr = fgets(line, 1024, stdin); char *tok = strtok(line, " \t\n"); if (!strcmp(tok, "dp")) { @@ -137,7 +137,7 @@ void gpgpu_sim::gpgpu_debug() { } else if (!strcmp(tok, "q") || !strcmp(tok, "quit")) { printf("\nreally quit GPGPU-Sim (y/n)?\n"); ptr = fgets(line, 1024, stdin); - if(ptr == NULL ){ + if (ptr == NULL) { printf("can't read input\n"); exit(0); } diff --git a/src/gpgpu-sim/dram.cc b/src/gpgpu-sim/dram.cc index 53c823870..80e20d795 100644 --- a/src/gpgpu-sim/dram.cc +++ b/src/gpgpu-sim/dram.cc @@ -1,18 +1,19 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// Ivan Sham, George L. Yuan, Vijay Kandiah, Nikos Hardavellas, +// Ivan Sham, George L. Yuan, Vijay Kandiah, Nikos Hardavellas, // Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. 
Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -592,39 +593,40 @@ bool dram_t::issue_col_command(int j) { bk[j]->mrq = NULL; } } else - // correct row activated for a WRITE - if (!issued && !CCDc && !bk[j]->RCDWRc && !(bkgrp[grp]->CCDLc) && - (bk[j]->curr_row == bk[j]->mrq->row) && (bk[j]->mrq->rw == WRITE) && - (RTWc == 0) && (bk[j]->state == BANK_ACTIVE) && !rwq->full()) { - if (rw == READ) { - rw = WRITE; - rwq->set_min_length(m_config->WL); - } - rwq->push(bk[j]->mrq); - - bk[j]->mrq->txbytes += m_config->dram_atom_size; - CCDc = m_config->tCCD; - bkgrp[grp]->CCDLc = m_config->tCCDL; - WTRc = m_config->tWTR; - bk[j]->WTPc = m_config->tWTP; - issued = true; - - if (bk[j]->mrq->data->get_access_type() == L2_WRBK_ACC) - n_wr_WB++; - else - n_wr++; - bwutil += m_config->BL / m_config->data_command_freq_ratio; - bwutil_partial += m_config->BL / m_config->data_command_freq_ratio; + // correct row activated for a WRITE + if (!issued && !CCDc && !bk[j]->RCDWRc && !(bkgrp[grp]->CCDLc) && + (bk[j]->curr_row == bk[j]->mrq->row) && (bk[j]->mrq->rw == WRITE) && + (RTWc == 0) && (bk[j]->state == BANK_ACTIVE) && !rwq->full()) { + if (rw == READ) { + rw = WRITE; + rwq->set_min_length(m_config->WL); + } + rwq->push(bk[j]->mrq); + + bk[j]->mrq->txbytes += m_config->dram_atom_size; + CCDc = m_config->tCCD; + bkgrp[grp]->CCDLc = m_config->tCCDL; + WTRc = m_config->tWTR; + bk[j]->WTPc = m_config->tWTP; + issued = true; + + if (bk[j]->mrq->data->get_access_type() == L2_WRBK_ACC) + n_wr_WB++; + else + n_wr++; + bwutil += m_config->BL / m_config->data_command_freq_ratio; + bwutil_partial += m_config->BL / m_config->data_command_freq_ratio; #ifdef DRAM_VERIFY - PRINT_CYCLE = 1; - printf("\tWR Bk:%d Row:%03x Col:%03x \n", j, bk[j]->curr_row, - bk[j]->mrq->col + bk[j]->mrq->txbytes - 
m_config->dram_atom_size); + PRINT_CYCLE = 1; + printf( + "\tWR Bk:%d Row:%03x Col:%03x \n", j, bk[j]->curr_row, + bk[j]->mrq->col + bk[j]->mrq->txbytes - m_config->dram_atom_size); #endif - // transfer done - if (!(bk[j]->mrq->txbytes < bk[j]->mrq->nbytes)) { - bk[j]->mrq = NULL; + // transfer done + if (!(bk[j]->mrq->txbytes < bk[j]->mrq->nbytes)) { + bk[j]->mrq = NULL; + } } - } } return issued; @@ -660,23 +662,23 @@ bool dram_t::issue_row_command(int j) { } else - // different row activated - if ((!issued) && (bk[j]->curr_row != bk[j]->mrq->row) && - (bk[j]->state == BANK_ACTIVE) && - (!bk[j]->RASc && !bk[j]->WTPc && !bk[j]->RTPc && - !bkgrp[grp]->RTPLc)) { - // make the bank idle again - bk[j]->state = BANK_IDLE; - bk[j]->RPc = m_config->tRP; - prio = (j + 1) % m_config->nbk; - issued = true; - n_pre++; - n_pre_partial++; + // different row activated + if ((!issued) && (bk[j]->curr_row != bk[j]->mrq->row) && + (bk[j]->state == BANK_ACTIVE) && + (!bk[j]->RASc && !bk[j]->WTPc && !bk[j]->RTPc && + !bkgrp[grp]->RTPLc)) { + // make the bank idle again + bk[j]->state = BANK_IDLE; + bk[j]->RPc = m_config->tRP; + prio = (j + 1) % m_config->nbk; + issued = true; + n_pre++; + n_pre_partial++; #ifdef DRAM_VERIFY - PRINT_CYCLE = 1; - printf("\tPRE BK:%d Row:%03x \n", j, bk[j]->curr_row); + PRINT_CYCLE = 1; + printf("\tPRE BK:%d Row:%03x \n", j, bk[j]->curr_row); #endif - } + } } return issued; } @@ -880,5 +882,5 @@ unsigned dram_t::get_bankgrp_number(unsigned i) { } else { assert(1); } - return 0; // we should never get here + return 0; // we should never get here } diff --git a/src/gpgpu-sim/dram.h b/src/gpgpu-sim/dram.h index 90ea3e40e..9e9517b9d 100644 --- a/src/gpgpu-sim/dram.h +++ b/src/gpgpu-sim/dram.h @@ -1,18 +1,19 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Ivan Sham, Ali Bakhoda, // George L. Yuan, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas, // Mahmoud Khairy, Junrui Pan, Timothy G. 
Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index f4448d3d0..32cc56b63 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -1,18 +1,19 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, // Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, // Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. 
Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -287,10 +288,11 @@ enum cache_request_status tag_array::probe(new_addr_type addr, unsigned &idx, // number of dirty lines / total lines in the cache float dirty_line_percentage = ((float)m_dirty / (m_config.m_nset * m_config.m_assoc)) * 100; - // If the cacheline is from a load op (not modified), + // If the cacheline is from a load op (not modified), // or the total dirty cacheline is above a specific value, - // Then this cacheline is eligible to be considered for replacement candidate - // i.e. Only evict clean cachelines until total dirty cachelines reach the limit. + // Then this cacheline is eligible to be considered for replacement + // candidate i.e. Only evict clean cachelines until total dirty cachelines + // reach the limit. 
if (!line->is_modified_line() || dirty_line_percentage >= m_config.m_wr_percent) { all_reserved = false; @@ -411,7 +413,7 @@ void tag_array::fill(new_addr_type addr, unsigned time, enum cache_request_status status = probe(addr, idx, mask, is_write); if (status == RESERVATION_FAIL) { - return; + return; } bool before = m_lines[idx]->is_modified_line(); @@ -437,7 +439,8 @@ void tag_array::fill(new_addr_type addr, unsigned time, void tag_array::fill(unsigned index, unsigned time, mem_fetch *mf) { assert(m_config.m_alloc_policy == ON_MISS); bool before = m_lines[index]->is_modified_line(); - m_lines[index]->fill(time, mf->get_access_sector_mask(), mf->get_access_byte_mask()); + m_lines[index]->fill(time, mf->get_access_sector_mask(), + mf->get_access_byte_mask()); if (m_lines[index]->is_modified_line() && !before) { m_dirty++; } @@ -1210,15 +1213,14 @@ void data_cache::update_m_readable(mem_fetch *mf, unsigned cache_index) { if (mf->get_access_sector_mask().test(i)) { bool all_set = true; for (unsigned k = i * SECTOR_SIZE; k < (i + 1) * SECTOR_SIZE; k++) { - // If any bit in the byte mask (within the sector) is not set, + // If any bit in the byte mask (within the sector) is not set, // the sector is unreadble if (!block->get_dirty_byte_mask().test(k)) { all_set = false; break; } } - if (all_set) - block->set_m_readable(true, mf->get_access_sector_mask()); + if (all_set) block->set_m_readable(true, mf->get_access_sector_mask()); } } } @@ -1239,7 +1241,7 @@ cache_request_status data_cache::wr_hit_wb(new_addr_type addr, } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); - update_m_readable(mf,cache_index); + update_m_readable(mf, cache_index); return HIT; } @@ -1263,7 +1265,7 @@ cache_request_status data_cache::wr_hit_wt(new_addr_type addr, } block->set_status(MODIFIED, mf->get_access_sector_mask()); block->set_byte_mask(mf); - update_m_readable(mf,cache_index); + update_m_readable(mf, cache_index); // generate a write-through 
send_write_request(mf, cache_event(WRITE_REQUEST_SENT), time, events); @@ -1559,7 +1561,7 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( if (m_status == HIT_RESERVED) block->set_readable_on_fill(true, mf->get_access_sector_mask()); } - update_m_readable(mf,cache_index); + update_m_readable(mf, cache_index); if (m_status != RESERVATION_FAIL) { // If evicted block is modified and not a write-through diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index ad41320bf..5fd40a9bc 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas, -// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University +// Copyright (c) 2009-2021, Tor M. Aamodt, Tayler Hetherington, Vijay Kandiah, +// Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The +// University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. 
Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -499,7 +500,7 @@ struct sector_cache_block : public cache_block_t { for (unsigned i = 0; i < SECTOR_CHUNCK_SIZE; ++i) { if (sector_mask.to_ulong() & (1 << i)) return i; } - return SECTOR_CHUNCK_SIZE; //error + return SECTOR_CHUNCK_SIZE; // error } }; @@ -564,12 +565,10 @@ class cache_config { char ct, rp, wp, ap, mshr_type, wap, sif; int ntok = - sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u", - &ct, &m_nset, &m_line_sz, &m_assoc, - &rp, &wp, &ap, &wap, &sif, - &mshr_type, &m_mshr_entries, &m_mshr_max_merge, - &m_miss_queue_size, &m_result_fifo_entries, - &m_data_port_width); + sscanf(config, "%c:%u:%u:%u,%c:%c:%c:%c:%c,%c:%u:%u,%u:%u,%u", &ct, + &m_nset, &m_line_sz, &m_assoc, &rp, &wp, &ap, &wap, &sif, + &mshr_type, &m_mshr_entries, &m_mshr_max_merge, + &m_miss_queue_size, &m_result_fifo_entries, &m_data_port_width); if (ntok < 12) { if (!strcmp(config, "none")) { @@ -726,15 +725,14 @@ class cache_config { } if (m_cache_type == SECTOR) { - bool cond = - m_line_sz / SECTOR_SIZE == SECTOR_CHUNCK_SIZE && - m_line_sz % SECTOR_SIZE == 0; - if(!cond){ - std::cerr<<"error: For sector cache, the simulator uses hard-coded " - "SECTOR_SIZE and SECTOR_CHUNCK_SIZE. The line size " - "must be product of both values.\n"; - assert(0); - } + bool cond = m_line_sz / SECTOR_SIZE == SECTOR_CHUNCK_SIZE && + m_line_sz % SECTOR_SIZE == 0; + if (!cond) { + std::cerr << "error: For sector cache, the simulator uses hard-coded " + "SECTOR_SIZE and SECTOR_CHUNCK_SIZE. 
The line size " + "must be product of both values.\n"; + assert(0); + } } // default: port to data array width and granularity = line size diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 47c0b4a89..1cb8a251e 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1,18 +1,19 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, George L. Yuan, -// Ali Bakhoda, Andrew Turner, Ivan Sham, Vijay Kandiah, Nikos Hardavellas, +// Ali Bakhoda, Andrew Turner, Ivan Sham, Vijay Kandiah, Nikos Hardavellas, // Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -80,7 +81,7 @@ class gpgpu_sim_wrapper {}; #include #include -// #define MAX(a, b) (((a) > (b)) ? (a) : (b)) //redefined +// #define MAX(a, b) (((a) > (b)) ? 
(a) : (b)) //redefined bool g_interactive_debugger_enabled = false; @@ -97,7 +98,6 @@ tr1_hash_map address_random_interleaving; #include "mem_latency_stat.h" - void power_config::reg_options(class OptionParser *opp) { option_parser_register(opp, "-accelwattch_xml_file", OPT_CSTR, &g_power_config_name, "AccelWattch XML file", @@ -111,91 +111,106 @@ void power_config::reg_options(class OptionParser *opp) { &g_power_per_cycle_dump, "Dump detailed power output each cycle", "0"); - - - option_parser_register(opp, "-hw_perf_file_name", OPT_CSTR, - &g_hw_perf_file_name, "Hardware Performance Statistics file", - "hw_perf.csv"); + &g_hw_perf_file_name, + "Hardware Performance Statistics file", "hw_perf.csv"); - option_parser_register(opp, "-hw_perf_bench_name", OPT_CSTR, - &g_hw_perf_bench_name, "Kernel Name in Hardware Performance Statistics file", - ""); + option_parser_register( + opp, "-hw_perf_bench_name", OPT_CSTR, &g_hw_perf_bench_name, + "Kernel Name in Hardware Performance Statistics file", ""); option_parser_register(opp, "-power_simulation_mode", OPT_INT32, &g_power_simulation_mode, - "Switch performance counter input for power simulation (0=Sim, 1=HW, 2=HW-Sim Hybrid)", "0"); + "Switch performance counter input for power " + "simulation (0=Sim, 1=HW, 2=HW-Sim Hybrid)", + "0"); - option_parser_register(opp, "-dvfs_enabled", OPT_BOOL, - &g_dvfs_enabled, + option_parser_register(opp, "-dvfs_enabled", OPT_BOOL, &g_dvfs_enabled, "Turn on DVFS for power model", "0"); option_parser_register(opp, "-aggregate_power_stats", OPT_BOOL, &g_aggregate_power_stats, "Accumulate power across all kernels", "0"); - //Accelwattch Hyrbid Configuration - - option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_RH", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_L1_RH], - "Get L1 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_RM", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_L1_RM], - "Get L1 Read Misses 
for Accelwattch-Hybrid from Accel-Sim", "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_WH", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_L1_WH], - "Get L1 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_L1_WM", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_L1_WM], - "Get L1 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0"); - - option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_RH", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_L2_RH], - "Get L2 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_RM", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_L2_RM], - "Get L2 Read Misses for Accelwattch-Hybrid from Accel-Sim", "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_WH", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_L2_WH], - "Get L2 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_L2_WM", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_L2_WM], - "Get L2 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0"); - - option_parser_register(opp, "-accelwattch_hybrid_perfsim_CC_ACC", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_CC_ACC], - "Get Constant Cache Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); - - option_parser_register(opp, "-accelwattch_hybrid_perfsim_SHARED_ACC", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_SHRD_ACC], - "Get Shared Memory Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + // Accelwattch Hyrbid Configuration + + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_L1_RH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_RH], + "Get L1 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_L1_RM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_RM], + "Get L1 Read Misses for 
Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_L1_WH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_WH], + "Get L1 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_L1_WM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L1_WM], + "Get L1 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_L2_RH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_RH], + "Get L2 Read Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_L2_RM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_RM], + "Get L2 Read Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_L2_WH", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_WH], + "Get L2 Write Hits for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_L2_WM", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_L2_WM], + "Get L2 Write Misses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_CC_ACC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_CC_ACC], + "Get Constant Cache Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_SHARED_ACC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_SHRD_ACC], + "Get Shared Memory Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); option_parser_register(opp, "-accelwattch_hybrid_perfsim_DRAM_RD", OPT_BOOL, &accelwattch_hybrid_configuration[HW_DRAM_RD], - "Get DRAM Reads for Accelwattch-Hybrid from Accel-Sim", "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_DRAM_WR", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_DRAM_WR], - "Get DRAM Writes for Accelwattch-Hybrid from 
Accel-Sim", "0"); - - option_parser_register(opp, "-accelwattch_hybrid_perfsim_NOC", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_NOC], - "Get Interconnect Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + "Get DRAM Reads for Accelwattch-Hybrid from Accel-Sim", + "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_DRAM_WR", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_DRAM_WR], + "Get DRAM Writes for Accelwattch-Hybrid from Accel-Sim", "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_PIPE_DUTY", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_PIPE_DUTY], - "Get Pipeline Duty Cycle Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_NOC", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_NOC], + "Get Interconnect Acesses for Accelwattch-Hybrid from Accel-Sim", "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_NUM_SM_IDLE", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_NUM_SM_IDLE], - "Get Number of Idle SMs for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_PIPE_DUTY", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_PIPE_DUTY], + "Get Pipeline Duty Cycle Acesses for Accelwattch-Hybrid from Accel-Sim", + "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_CYCLES", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_CYCLES], - "Get Executed Cycles for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_NUM_SM_IDLE", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_NUM_SM_IDLE], + "Get Number of Idle SMs for Accelwattch-Hybrid from Accel-Sim", "0"); - option_parser_register(opp, "-accelwattch_hybrid_perfsim_VOLTAGE", OPT_BOOL, - &accelwattch_hybrid_configuration[HW_VOLTAGE], - "Get Chip Voltage for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_CYCLES", OPT_BOOL, + 
&accelwattch_hybrid_configuration[HW_CYCLES], + "Get Executed Cycles for Accelwattch-Hybrid from Accel-Sim", "0"); + option_parser_register( + opp, "-accelwattch_hybrid_perfsim_VOLTAGE", OPT_BOOL, + &accelwattch_hybrid_configuration[HW_VOLTAGE], + "Get Chip Voltage for Accelwattch-Hybrid from Accel-Sim", "0"); // Output Data Formats option_parser_register( @@ -702,7 +717,8 @@ void gpgpu_sim_config::reg_options(option_parser_t opp) { option_parser_register( opp, "-gpgpu_max_concurrent_kernel", OPT_INT32, &max_concurrent_kernel, "maximum kernels that can run concurrently on GPU, set this value " - "according to max resident grids for your compute capability", "32"); + "according to max resident grids for your compute capability", + "32"); option_parser_register( opp, "-gpgpu_cflog_interval", OPT_INT32, &gpgpu_cflog_interval, "Interval between each snapshot in control flow logger", "0"); @@ -924,8 +940,9 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) ptx_file_line_stats_create_exposed_latency_tracker(m_config.num_shader()); #ifdef GPGPUSIM_POWER_MODEL - m_gpgpusim_wrapper = new gpgpu_sim_wrapper(config.g_power_simulation_enabled, - config.g_power_config_name, config.g_power_simulation_mode, config.g_dvfs_enabled); + m_gpgpusim_wrapper = new gpgpu_sim_wrapper( + config.g_power_simulation_enabled, config.g_power_config_name, + config.g_power_simulation_mode, config.g_dvfs_enabled); #endif m_shader_stats = new shader_core_stats(m_shader_config); @@ -1157,8 +1174,7 @@ void gpgpu_sim::update_stats() { gpu_occupancy = occupancy_stats(); } -PowerscalingCoefficients *gpgpu_sim::get_scaling_coeffs() -{ +PowerscalingCoefficients *gpgpu_sim::get_scaling_coeffs() { return m_gpgpusim_wrapper->get_scaling_coeffs(); } @@ -1243,10 +1259,10 @@ std::string gpgpu_sim::executed_kernel_info_string() { } std::string gpgpu_sim::executed_kernel_name() { - std::stringstream statout; - if( m_executed_kernel_names.size() == 1) - statout << 
m_executed_kernel_names[0]; - else{ + std::stringstream statout; + if (m_executed_kernel_names.size() == 1) + statout << m_executed_kernel_names[0]; + else { for (unsigned int k = 0; k < m_executed_kernel_names.size(); k++) { statout << m_executed_kernel_names[k] << " "; } @@ -1433,20 +1449,23 @@ void gpgpu_sim::gpu_print_stat() { m_shader_stats->print(stdout); #ifdef GPGPUSIM_POWER_MODEL if (m_config.g_power_simulation_enabled) { - if(m_config.g_power_simulation_mode > 0){ - //if(!m_config.g_aggregate_power_stats) - mcpat_reset_perf_count(m_gpgpusim_wrapper); - calculate_hw_mcpat(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, - m_power_stats, m_config.gpu_stat_sample_freq, - gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, - gpu_sim_insn, m_config.g_power_simulation_mode, m_config.g_dvfs_enabled, - m_config.g_hw_perf_file_name, m_config.g_hw_perf_bench_name, executed_kernel_name(), m_config.accelwattch_hybrid_configuration, m_config.g_aggregate_power_stats); + if (m_config.g_power_simulation_mode > 0) { + // if(!m_config.g_aggregate_power_stats) + mcpat_reset_perf_count(m_gpgpusim_wrapper); + calculate_hw_mcpat(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, + m_power_stats, m_config.gpu_stat_sample_freq, + gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, + gpu_sim_insn, m_config.g_power_simulation_mode, + m_config.g_dvfs_enabled, m_config.g_hw_perf_file_name, + m_config.g_hw_perf_bench_name, executed_kernel_name(), + m_config.accelwattch_hybrid_configuration, + m_config.g_aggregate_power_stats); } m_gpgpusim_wrapper->print_power_kernel_stats( gpu_sim_cycle, gpu_tot_sim_cycle, gpu_tot_sim_insn + gpu_sim_insn, kernel_info_str, true); - //if(!m_config.g_aggregate_power_stats) - mcpat_reset_perf_count(m_gpgpusim_wrapper); + // if(!m_config.g_aggregate_power_stats) + mcpat_reset_perf_count(m_gpgpusim_wrapper); } #endif @@ -1810,7 +1829,8 @@ void shader_core_ctx::issue_block2core(kernel_info_t &kernel) { "GPGPU-Sim uArch: cta:%2u, start_tid:%4u, 
end_tid:%4u, " "initialized @(%lld,%lld), kernel_uid:%u, kernel_name:%s\n", free_cta_hw_id, start_thread, end_thread, m_gpu->gpu_sim_cycle, - m_gpu->gpu_tot_sim_cycle, kernel.get_uid(), kernel.get_name().c_str()); + m_gpu->gpu_tot_sim_cycle, kernel.get_uid(), + kernel.get_name().c_str()); } /////////////////////////////////////////////////////////////////////////////////////////// @@ -1987,11 +2007,11 @@ void gpgpu_sim::cycle() { // McPAT main cycle (interface with McPAT) #ifdef GPGPUSIM_POWER_MODEL if (m_config.g_power_simulation_enabled) { - if(m_config.g_power_simulation_mode == 0){ - mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, - m_power_stats, m_config.gpu_stat_sample_freq, - gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, - gpu_sim_insn, m_config.g_dvfs_enabled); + if (m_config.g_power_simulation_mode == 0) { + mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, + m_power_stats, m_config.gpu_stat_sample_freq, + gpu_tot_sim_cycle, gpu_sim_cycle, gpu_tot_sim_insn, + gpu_sim_insn, m_config.g_dvfs_enabled); } } #endif diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index de69ef8ce..a24ffd30e 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, Nikos Hardavellas -// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University +// Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Vijay Kandiah, +// Nikos Hardavellas Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The +// University of British Columbia, Northwestern University, Purdue University // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. 
Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -28,7 +29,6 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. - #ifndef GPU_SIM_H #define GPU_SIM_H @@ -72,7 +72,7 @@ extern tr1_hash_map address_random_interleaving; enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 }; enum hw_perf_t { - HW_BENCH_NAME=0, + HW_BENCH_NAME = 0, HW_KERNEL_NAME, HW_L1_RH, HW_L1_RM, @@ -108,7 +108,7 @@ struct power_config { s++; } char buf1[1024]; - //snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date); + // snprintf(buf1, 1024, "accelwattch_power_report__%s.log", date); snprintf(buf1, 1024, "accelwattch_power_report.log"); g_power_filename = strdup(buf1); char buf2[1024]; @@ -155,7 +155,6 @@ struct power_config { double gpu_steady_power_deviation; double gpu_steady_min_period; - char *g_hw_perf_file_name; char *g_hw_perf_bench_name; int g_power_simulation_mode; diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 5b63765a6..846945378 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas, +// Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas, // Mahmoud Khairy, Junrui Pan, Timothy G. 
Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -390,7 +391,8 @@ void memory_partition_unit::set_done(mem_fetch *mf) { void memory_partition_unit::set_dram_power_stats( unsigned &n_cmd, unsigned &n_activity, unsigned &n_nop, unsigned &n_act, - unsigned &n_pre, unsigned &n_rd, unsigned &n_wr, unsigned &n_wr_WB, unsigned &n_req) const { + unsigned &n_pre, unsigned &n_rd, unsigned &n_wr, unsigned &n_wr_WB, + unsigned &n_req) const { m_dram->set_dram_power_stats(n_cmd, n_activity, n_nop, n_act, n_pre, n_rd, n_wr, n_wr_WB, n_req); } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index 7fa1f2917..ccf9b70e8 100644 --- a/src/gpgpu-sim/l2cache.h +++ b/src/gpgpu-sim/l2cache.h @@ -1,17 +1,18 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Vijay Kandiah, Nikos Hardavellas, // Mahmoud Khairy, Junrui Pan, Timothy G. 
Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. diff --git a/src/gpgpu-sim/local_interconnect.cc b/src/gpgpu-sim/local_interconnect.cc index fe7bc74fb..7e1ab5b04 100644 --- a/src/gpgpu-sim/local_interconnect.cc +++ b/src/gpgpu-sim/local_interconnect.cc @@ -148,7 +148,7 @@ void xbar_router::RR_Advance() { } } } - next_node_id = next_node_id + 1 ; + next_node_id = next_node_id + 1; next_node_id = (next_node_id % total_nodes); conflicts += conflict_sub; diff --git a/src/gpgpu-sim/power_interface.cc b/src/gpgpu-sim/power_interface.cc index 45a09bcd9..cddb6e987 100644 --- a/src/gpgpu-sim/power_interface.cc +++ b/src/gpgpu-sim/power_interface.cc @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas, -// Mahmoud Khairy, Junrui Pan, Timothy G. 
Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler +// Hetherington, Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, +// Timothy G. Rogers The University of British Columbia, Northwestern +// University, Purdue University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -28,10 +29,8 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
- #include "power_interface.h" - void init_mcpat(const gpgpu_sim_config &config, class gpgpu_sim_wrapper *wrapper, unsigned stat_sample_freq, unsigned tot_inst, unsigned inst) { @@ -42,11 +41,9 @@ void init_mcpat(const gpgpu_sim_config &config, config.g_power_simulation_enabled, config.g_power_trace_enabled, config.g_steady_power_levels_enabled, config.g_power_per_cycle_dump, config.gpu_steady_power_deviation, config.gpu_steady_min_period, - config.g_power_trace_zlevel, tot_inst + inst, stat_sample_freq, - config.g_power_simulation_mode, - config.g_dvfs_enabled, - config.get_core_freq()/1000000, - config.num_shader()); + config.g_power_trace_zlevel, tot_inst + inst, stat_sample_freq, + config.g_power_simulation_mode, config.g_dvfs_enabled, + config.get_core_freq() / 1000000, config.num_shader()); } void mcpat_cycle(const gpgpu_sim_config &config, @@ -63,8 +60,9 @@ void mcpat_cycle(const gpgpu_sim_config &config, } if ((tot_cycle + cycle) % stat_sample_freq == 0) { - if(dvfs_enabled){ - wrapper->set_model_voltage(1); //performance model needs to support this. + if (dvfs_enabled) { + wrapper->set_model_voltage(1); // performance model needs to support + // this. 
} wrapper->set_inst_power( @@ -85,14 +83,17 @@ void mcpat_cycle(const gpgpu_sim_config &config, power_stats->get_inst_c_misses(0)); // Constant Cache, shared memory, texture cache - wrapper->set_ccache_power(power_stats->get_const_accessess(0), 0); //assuming all HITS in constant cache for now + wrapper->set_ccache_power( + power_stats->get_const_accessess(0), + 0); // assuming all HITS in constant cache for now wrapper->set_tcache_power(power_stats->get_texture_c_hits(), power_stats->get_texture_c_misses()); wrapper->set_shrd_mem_power(power_stats->get_shmem_access(0)); - wrapper->set_l1cache_power( - power_stats->get_l1d_read_hits(0), power_stats->get_l1d_read_misses(0), - power_stats->get_l1d_write_hits(0), power_stats->get_l1d_write_misses(0)); + wrapper->set_l1cache_power(power_stats->get_l1d_read_hits(0), + power_stats->get_l1d_read_misses(0), + power_stats->get_l1d_write_hits(0), + power_stats->get_l1d_write_misses(0)); wrapper->set_l2cache_power( power_stats->get_l2_read_hits(0), power_stats->get_l2_read_misses(0), @@ -120,24 +121,23 @@ void mcpat_cycle(const gpgpu_sim_config &config, // Execution pipeline accesses // FPU (SP) accesses, Integer ALU (not present in Tesla), Sfu accesses - wrapper->set_int_accesses(power_stats->get_ialu_accessess(0), - power_stats->get_intmul24_accessess(0), - power_stats->get_intmul32_accessess(0), - power_stats->get_intmul_accessess(0), + wrapper->set_int_accesses(power_stats->get_ialu_accessess(0), + power_stats->get_intmul24_accessess(0), + power_stats->get_intmul32_accessess(0), + power_stats->get_intmul_accessess(0), power_stats->get_intdiv_accessess(0)); - wrapper->set_dp_accesses(power_stats->get_dp_accessess(0), - power_stats->get_dpmul_accessess(0), - power_stats->get_dpdiv_accessess(0)); + wrapper->set_dp_accesses(power_stats->get_dp_accessess(0), + power_stats->get_dpmul_accessess(0), + power_stats->get_dpdiv_accessess(0)); - wrapper->set_fp_accesses(power_stats->get_fp_accessess(0), - 
power_stats->get_fpmul_accessess(0), - power_stats->get_fpdiv_accessess(0)); + wrapper->set_fp_accesses(power_stats->get_fp_accessess(0), + power_stats->get_fpmul_accessess(0), + power_stats->get_fpdiv_accessess(0)); - wrapper->set_trans_accesses(power_stats->get_sqrt_accessess(0), - power_stats->get_log_accessess(0), - power_stats->get_sin_accessess(0), - power_stats->get_exp_accessess(0)); + wrapper->set_trans_accesses( + power_stats->get_sqrt_accessess(0), power_stats->get_log_accessess(0), + power_stats->get_sin_accessess(0), power_stats->get_exp_accessess(0)); wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(0)); @@ -154,23 +154,21 @@ void mcpat_cycle(const gpgpu_sim_config &config, (power_stats->get_sp_active_lanes()) / stat_sample_freq; float avg_sfu_active_lanes = (power_stats->get_sfu_active_lanes()) / stat_sample_freq; - if(avg_sp_active_lanes >32.0 ) - avg_sp_active_lanes = 32.0; - if(avg_sfu_active_lanes >32.0 ) - avg_sfu_active_lanes = 32.0; + if (avg_sp_active_lanes > 32.0) avg_sp_active_lanes = 32.0; + if (avg_sfu_active_lanes > 32.0) avg_sfu_active_lanes = 32.0; assert(avg_sp_active_lanes <= 32); assert(avg_sfu_active_lanes <= 32); wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); - double n_icnt_simt_to_mem = - (double) - power_stats->get_icnt_simt_to_mem(0); // # flits from SIMT clusters - // to memory partitions - double n_icnt_mem_to_simt = - (double) - power_stats->get_icnt_mem_to_simt(0); // # flits from memory - // partitions to SIMT clusters - wrapper->set_NoC_power(n_icnt_mem_to_simt + n_icnt_simt_to_mem); // Number of flits traversing the interconnect + double n_icnt_simt_to_mem = (double)power_stats->get_icnt_simt_to_mem( + 0); // # flits from SIMT clusters + // to memory partitions + double n_icnt_mem_to_simt = (double)power_stats->get_icnt_mem_to_simt( + 0); // # flits from memory + // partitions to SIMT clusters + wrapper->set_NoC_power( + n_icnt_mem_to_simt + + n_icnt_simt_to_mem); // Number of 
flits traversing the interconnect wrapper->compute(); @@ -191,68 +189,77 @@ void mcpat_reset_perf_count(class gpgpu_sim_wrapper *wrapper) { wrapper->reset_counters(); } -bool parse_hw_file(char* hwpowerfile, bool find_target_kernel, vector &hw_data, char* benchname, std::string executed_kernelname){ +bool parse_hw_file(char *hwpowerfile, bool find_target_kernel, + vector &hw_data, char *benchname, + std::string executed_kernelname) { fstream hw_file; hw_file.open(hwpowerfile, ios::in); string line, word, temp; - while(!hw_file.eof()){ + while (!hw_file.eof()) { hw_data.clear(); getline(hw_file, line); stringstream s(line); - while (getline(s,word,',')){ + while (getline(s, word, ',')) { hw_data.push_back(word); } - if(hw_data[HW_BENCH_NAME] == std::string(benchname)){ - if(find_target_kernel){ - if(hw_data[HW_KERNEL_NAME] == ""){ + if (hw_data[HW_BENCH_NAME] == std::string(benchname)) { + if (find_target_kernel) { + if (hw_data[HW_KERNEL_NAME] == "") { hw_file.close(); return true; - } - else{ - if(hw_data[HW_KERNEL_NAME] == executed_kernelname){ + } else { + if (hw_data[HW_KERNEL_NAME] == executed_kernelname) { hw_file.close(); return true; } } - } - else{ + } else { hw_file.close(); return true; } - } + } } hw_file.close(); return false; } - -void calculate_hw_mcpat(const gpgpu_sim_config &config, - const shader_core_config *shdr_config, - class gpgpu_sim_wrapper *wrapper, - class power_stat_t *power_stats, unsigned stat_sample_freq, - unsigned tot_cycle, unsigned cycle, unsigned tot_inst, - unsigned inst, int power_simulation_mode, bool dvfs_enabled, char* hwpowerfile, - char* benchname, std::string executed_kernelname, - const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats){ - +void calculate_hw_mcpat( + const gpgpu_sim_config &config, const shader_core_config *shdr_config, + class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, + unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, + unsigned tot_inst, unsigned inst, 
int power_simulation_mode, + bool dvfs_enabled, char *hwpowerfile, char *benchname, + std::string executed_kernelname, + const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats) { /* Reading HW data from CSV file */ vector hw_data; bool kernel_found = false; - kernel_found = parse_hw_file(hwpowerfile, true, hw_data, benchname, executed_kernelname); //Searching for matching executed_kernelname. - if(!kernel_found) - kernel_found = parse_hw_file(hwpowerfile, false, hw_data, benchname, executed_kernelname); //Searching for any kernel with same benchname. - assert("Could not find perf stats for the target benchmark in hwpowerfile.\n" && (kernel_found)); - unsigned perf_cycles = static_cast(std::stod(hw_data[HW_CYCLES]) + 0.5); - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_CYCLES])) + kernel_found = parse_hw_file( + hwpowerfile, true, hw_data, benchname, + executed_kernelname); // Searching for matching executed_kernelname. + if (!kernel_found) + kernel_found = parse_hw_file( + hwpowerfile, false, hw_data, benchname, + executed_kernelname); // Searching for any kernel with same benchname. 
+ assert( + "Could not find perf stats for the target benchmark in hwpowerfile.\n" && + (kernel_found)); + unsigned perf_cycles = + static_cast(std::stod(hw_data[HW_CYCLES]) + 0.5); + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_CYCLES])) perf_cycles = cycle; - wrapper->init_mcpat_hw_mode(perf_cycles); //total PERF MODEL cycles for current kernel + wrapper->init_mcpat_hw_mode( + perf_cycles); // total PERF MODEL cycles for current kernel - if(dvfs_enabled){ - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_VOLTAGE])) - wrapper->set_model_voltage(1); //performance model needs to support this - else - wrapper->set_model_voltage(std::stod(hw_data[HW_VOLTAGE])); //performance model needs to support this + if (dvfs_enabled) { + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_VOLTAGE])) + wrapper->set_model_voltage(1); // performance model needs to support this + else + wrapper->set_model_voltage(std::stod( + hw_data[HW_VOLTAGE])); // performance model needs to support this } double l1_read_hits = std::stod(hw_data[HW_L1_RH]); @@ -260,266 +267,302 @@ void calculate_hw_mcpat(const gpgpu_sim_config &config, double l1_write_hits = std::stod(hw_data[HW_L1_WH]); double l1_write_misses = std::stod(hw_data[HW_L1_WM]); - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_RH])) - l1_read_hits = power_stats->get_l1d_read_hits(1) - power_stats->l1r_hits_kernel; - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_RM])) - l1_read_misses = power_stats->get_l1d_read_misses(1) - power_stats->l1r_misses_kernel; - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_WH])) - l1_write_hits = power_stats->get_l1d_write_hits(1) - power_stats->l1w_hits_kernel; - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L1_WM])) - l1_write_misses = power_stats->get_l1d_write_misses(1) - power_stats->l1w_misses_kernel; - - 
if(aggregate_power_stats){ - power_stats->tot_inst_execution += power_stats->get_total_inst(1); - power_stats->tot_int_inst_execution += power_stats->get_total_int_inst(1); - power_stats->tot_fp_inst_execution += power_stats->get_total_fp_inst(1); - power_stats->commited_inst_execution += power_stats->get_committed_inst(1); - wrapper->set_inst_power( - shdr_config->gpgpu_clock_gated_lanes, cycle, //TODO: core.[0] cycles counts don't matter, remove this + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L1_RH])) + l1_read_hits = + power_stats->get_l1d_read_hits(1) - power_stats->l1r_hits_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L1_RM])) + l1_read_misses = + power_stats->get_l1d_read_misses(1) - power_stats->l1r_misses_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L1_WH])) + l1_write_hits = + power_stats->get_l1d_write_hits(1) - power_stats->l1w_hits_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L1_WM])) + l1_write_misses = + power_stats->get_l1d_write_misses(1) - power_stats->l1w_misses_kernel; + + if (aggregate_power_stats) { + power_stats->tot_inst_execution += power_stats->get_total_inst(1); + power_stats->tot_int_inst_execution += power_stats->get_total_int_inst(1); + power_stats->tot_fp_inst_execution += power_stats->get_total_fp_inst(1); + power_stats->commited_inst_execution += power_stats->get_committed_inst(1); + wrapper->set_inst_power( + shdr_config->gpgpu_clock_gated_lanes, + cycle, // TODO: core.[0] cycles counts don't matter, remove this cycle, power_stats->tot_inst_execution, power_stats->tot_int_inst_execution, power_stats->tot_fp_inst_execution, - l1_read_hits + l1_read_misses, - l1_write_hits + l1_write_misses, + l1_read_hits + l1_read_misses, l1_write_hits + l1_write_misses, power_stats->commited_inst_execution); - } - else{ - wrapper->set_inst_power( - shdr_config->gpgpu_clock_gated_lanes, cycle, 
//TODO: core.[0] cycles counts don't matter, remove this + } else { + wrapper->set_inst_power( + shdr_config->gpgpu_clock_gated_lanes, + cycle, // TODO: core.[0] cycles counts don't matter, remove this cycle, power_stats->get_total_inst(1), power_stats->get_total_int_inst(1), power_stats->get_total_fp_inst(1), - l1_read_hits + l1_read_misses, - l1_write_hits + l1_write_misses, + l1_read_hits + l1_read_misses, l1_write_hits + l1_write_misses, power_stats->get_committed_inst(1)); } - // Single RF for both int and fp ops -- activity factor set to 0 for Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for register files - wrapper->set_regfile_power(power_stats->get_regfile_reads(1), - power_stats->get_regfile_writes(1), - power_stats->get_non_regfile_operands(1)); + // Single RF for both int and fp ops -- activity factor set to 0 for + // Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for register + // files + wrapper->set_regfile_power(power_stats->get_regfile_reads(1), + power_stats->get_regfile_writes(1), + power_stats->get_non_regfile_operands(1)); + + // Instruction cache stats -- activity factor set to 0 for Accelwattch HW and + // Accelwattch Hybrid because no HW Perf Stats for instruction cache + wrapper->set_icache_power( + power_stats->get_inst_c_hits(1) - power_stats->l1i_hits_kernel, + power_stats->get_inst_c_misses(1) - power_stats->l1i_misses_kernel); + + // Constant Cache, shared memory, texture cache + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_CC_ACC])) + wrapper->set_ccache_power( + power_stats->get_const_accessess(1) - power_stats->cc_accesses_kernel, + 0); // assuming all HITS in constant cache for now + else + wrapper->set_ccache_power( + std::stod(hw_data[HW_CC_ACC]), + 0); // assuming all HITS in constant cache for now + + // wrapper->set_tcache_power(power_stats->get_texture_c_hits(), + // power_stats->get_texture_c_misses()); + + if ((power_simulation_mode == 2) && + 
(accelwattch_hybrid_configuration[HW_SHRD_ACC])) + wrapper->set_shrd_mem_power(power_stats->get_shmem_access(1) - + power_stats->shared_accesses_kernel); + else + wrapper->set_shrd_mem_power(std::stod(hw_data[HW_SHRD_ACC])); + + wrapper->set_l1cache_power(l1_read_hits, l1_read_misses, l1_write_hits, + l1_write_misses); + + double l2_read_hits = std::stod(hw_data[HW_L2_RH]); + double l2_read_misses = std::stod(hw_data[HW_L2_RM]); + double l2_write_hits = std::stod(hw_data[HW_L2_WH]); + double l2_write_misses = std::stod(hw_data[HW_L2_WM]); + + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L2_RH])) + l2_read_hits = + power_stats->get_l2_read_hits(1) - power_stats->l2r_hits_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L2_RM])) + l2_read_misses = + power_stats->get_l2_read_misses(1) - power_stats->l2r_misses_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L2_WH])) + l2_write_hits = + power_stats->get_l2_write_hits(1) - power_stats->l2w_hits_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_L2_WM])) + l2_write_misses = + power_stats->get_l2_write_misses(1) - power_stats->l2w_misses_kernel; + + wrapper->set_l2cache_power(l2_read_hits, l2_read_misses, l2_write_hits, + l2_write_misses); + + float active_sms = (*power_stats->m_active_sms) / stat_sample_freq; + float num_cores = shdr_config->num_shader(); + float num_idle_core = num_cores - active_sms; + wrapper->set_num_cores(num_cores); + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_NUM_SM_IDLE])) + wrapper->set_idle_core_power(num_idle_core); + else + wrapper->set_idle_core_power(std::stod(hw_data[HW_NUM_SM_IDLE])); - // Instruction cache stats -- activity factor set to 0 for Accelwattch HW and Accelwattch Hybrid because no HW Perf Stats for instruction cache - wrapper->set_icache_power(power_stats->get_inst_c_hits(1) - power_stats->l1i_hits_kernel, - 
power_stats->get_inst_c_misses(1) - power_stats->l1i_misses_kernel); + float pipeline_duty_cycle = + ((*power_stats->m_average_pipeline_duty_cycle / (stat_sample_freq)) < 0.8) + ? ((*power_stats->m_average_pipeline_duty_cycle) / stat_sample_freq) + : 0.8; - // Constant Cache, shared memory, texture cache - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_CC_ACC])) - wrapper->set_ccache_power(power_stats->get_const_accessess(1) - power_stats->cc_accesses_kernel, 0); //assuming all HITS in constant cache for now - else - wrapper->set_ccache_power(std::stod(hw_data[HW_CC_ACC]), 0); //assuming all HITS in constant cache for now - - - // wrapper->set_tcache_power(power_stats->get_texture_c_hits(), - // power_stats->get_texture_c_misses()); - - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_SHRD_ACC])) - wrapper->set_shrd_mem_power(power_stats->get_shmem_access(1) - power_stats->shared_accesses_kernel); - else - wrapper->set_shrd_mem_power(std::stod(hw_data[HW_SHRD_ACC])); - - wrapper->set_l1cache_power( l1_read_hits, l1_read_misses, l1_write_hits, l1_write_misses); - - double l2_read_hits = std::stod(hw_data[HW_L2_RH]); - double l2_read_misses = std::stod(hw_data[HW_L2_RM]); - double l2_write_hits = std::stod(hw_data[HW_L2_WH]); - double l2_write_misses = std::stod(hw_data[HW_L2_WM]); - - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_RH])) - l2_read_hits = power_stats->get_l2_read_hits(1) - power_stats->l2r_hits_kernel; - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_RM])) - l2_read_misses = power_stats->get_l2_read_misses(1) - power_stats->l2r_misses_kernel; - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_WH])) - l2_write_hits = power_stats->get_l2_write_hits(1) - power_stats->l2w_hits_kernel; - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_L2_WM])) - l2_write_misses = power_stats->get_l2_write_misses(1) - 
power_stats->l2w_misses_kernel; - - wrapper->set_l2cache_power(l2_read_hits, l2_read_misses, l2_write_hits, l2_write_misses); - - float active_sms = (*power_stats->m_active_sms) / stat_sample_freq; - float num_cores = shdr_config->num_shader(); - float num_idle_core = num_cores - active_sms; - wrapper->set_num_cores(num_cores); - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_NUM_SM_IDLE])) - wrapper->set_idle_core_power(num_idle_core); - else - wrapper->set_idle_core_power(std::stod(hw_data[HW_NUM_SM_IDLE])); + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_PIPE_DUTY])) + wrapper->set_duty_cycle_power(pipeline_duty_cycle); + else + wrapper->set_duty_cycle_power(std::stod(hw_data[HW_PIPE_DUTY])); + + // Memory Controller + + double dram_reads = std::stod(hw_data[HW_DRAM_RD]); + double dram_writes = std::stod(hw_data[HW_DRAM_WR]); + double dram_pre = 0; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_DRAM_RD])) + dram_reads = power_stats->get_dram_rd(1) - power_stats->dram_rd_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_DRAM_WR])) + dram_writes = power_stats->get_dram_wr(1) - power_stats->dram_wr_kernel; + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_DRAM_RD])) + dram_pre = power_stats->get_dram_pre(1) - power_stats->dram_pre_kernel; + + wrapper->set_mem_ctrl_power(dram_reads, dram_writes, dram_pre); + + if (aggregate_power_stats) { + power_stats->ialu_acc_execution += power_stats->get_ialu_accessess(1); + power_stats->imul24_acc_execution += power_stats->get_intmul24_accessess(1); + power_stats->imul32_acc_execution += power_stats->get_intmul32_accessess(1); + power_stats->imul_acc_execution += power_stats->get_intmul_accessess(1); + power_stats->idiv_acc_execution += power_stats->get_intdiv_accessess(1); + power_stats->dp_acc_execution += power_stats->get_dp_accessess(1); + power_stats->dpmul_acc_execution += 
power_stats->get_dpmul_accessess(1); + power_stats->dpdiv_acc_execution += power_stats->get_dpdiv_accessess(1); + power_stats->fp_acc_execution += power_stats->get_fp_accessess(1); + power_stats->fpmul_acc_execution += power_stats->get_fpmul_accessess(1); + power_stats->fpdiv_acc_execution += power_stats->get_fpdiv_accessess(1); + power_stats->sqrt_acc_execution += power_stats->get_sqrt_accessess(1); + power_stats->log_acc_execution += power_stats->get_log_accessess(1); + power_stats->sin_acc_execution += power_stats->get_sin_accessess(1); + power_stats->exp_acc_execution += power_stats->get_exp_accessess(1); + power_stats->tensor_acc_execution += power_stats->get_tensor_accessess(1); + power_stats->tex_acc_execution += power_stats->get_tex_accessess(1); + power_stats->tot_fpu_acc_execution += power_stats->get_tot_fpu_accessess(1); + power_stats->tot_sfu_acc_execution += power_stats->get_tot_sfu_accessess(1); + power_stats->tot_threads_acc_execution += + power_stats->get_tot_threads_kernel(1); + power_stats->tot_warps_acc_execution += + power_stats->get_tot_warps_kernel(1); + + power_stats->sp_active_lanes_execution += + (power_stats->get_sp_active_lanes() * shdr_config->num_shader() * + shdr_config->gpgpu_num_sp_units); + power_stats->sfu_active_lanes_execution += + (power_stats->get_sfu_active_lanes() * shdr_config->num_shader() * + shdr_config->gpgpu_num_sp_units); + + wrapper->set_int_accesses( + power_stats->ialu_acc_execution, power_stats->imul24_acc_execution, + power_stats->imul32_acc_execution, power_stats->imul_acc_execution, + power_stats->idiv_acc_execution); + + wrapper->set_dp_accesses(power_stats->dp_acc_execution, + power_stats->dpmul_acc_execution, + power_stats->dpdiv_acc_execution); + + wrapper->set_fp_accesses(power_stats->fp_acc_execution, + power_stats->fpmul_acc_execution, + power_stats->fpdiv_acc_execution); + + wrapper->set_trans_accesses( + power_stats->sqrt_acc_execution, power_stats->log_acc_execution, + power_stats->sin_acc_execution, 
power_stats->exp_acc_execution); + + wrapper->set_tensor_accesses(power_stats->tensor_acc_execution); + + wrapper->set_tex_accesses(power_stats->tex_acc_execution); + + wrapper->set_exec_unit_power(power_stats->ialu_acc_execution, + power_stats->tot_fpu_acc_execution, + power_stats->tot_sfu_acc_execution); + + wrapper->set_avg_active_threads( + (double)((double)power_stats->tot_threads_acc_execution / + (double)power_stats->tot_warps_acc_execution)); - float pipeline_duty_cycle = - ((*power_stats->m_average_pipeline_duty_cycle / (stat_sample_freq)) < - 0.8) - ? ((*power_stats->m_average_pipeline_duty_cycle) / stat_sample_freq) - : 0.8; - - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_PIPE_DUTY])) - wrapper->set_duty_cycle_power(pipeline_duty_cycle); - else - wrapper->set_duty_cycle_power(std::stod(hw_data[HW_PIPE_DUTY])); + // Average active lanes for sp and sfu pipelines + float avg_sp_active_lanes = + (power_stats->sp_active_lanes_execution) / shdr_config->num_shader() / + shdr_config->gpgpu_num_sp_units / stat_sample_freq; + float avg_sfu_active_lanes = + (power_stats->sfu_active_lanes_execution) / shdr_config->num_shader() / + shdr_config->gpgpu_num_sp_units / stat_sample_freq; + if (avg_sp_active_lanes > 32.0) avg_sp_active_lanes = 32.0; + if (avg_sfu_active_lanes > 32.0) avg_sfu_active_lanes = 32.0; + assert(avg_sp_active_lanes <= 32); + assert(avg_sfu_active_lanes <= 32); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + } else { + wrapper->set_int_accesses(power_stats->get_ialu_accessess(1), + power_stats->get_intmul24_accessess(1), + power_stats->get_intmul32_accessess(1), + power_stats->get_intmul_accessess(1), + power_stats->get_intdiv_accessess(1)); - // Memory Controller - - double dram_reads = std::stod(hw_data[HW_DRAM_RD]); - double dram_writes = std::stod(hw_data[HW_DRAM_WR]); - double dram_pre = 0; - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_RD])) - 
dram_reads = power_stats->get_dram_rd(1) - power_stats->dram_rd_kernel; - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_WR])) - dram_writes = power_stats->get_dram_wr(1) - power_stats->dram_wr_kernel; - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_DRAM_RD])) - dram_pre = power_stats->get_dram_pre(1) - power_stats->dram_pre_kernel; - - - wrapper->set_mem_ctrl_power(dram_reads, dram_writes, dram_pre); - - if(aggregate_power_stats){ - power_stats->ialu_acc_execution += power_stats->get_ialu_accessess(1); - power_stats->imul24_acc_execution += power_stats->get_intmul24_accessess(1); - power_stats->imul32_acc_execution += power_stats->get_intmul32_accessess(1); - power_stats->imul_acc_execution += power_stats->get_intmul_accessess(1); - power_stats->idiv_acc_execution += power_stats->get_intdiv_accessess(1); - power_stats->dp_acc_execution += power_stats->get_dp_accessess(1); - power_stats->dpmul_acc_execution += power_stats->get_dpmul_accessess(1); - power_stats->dpdiv_acc_execution += power_stats->get_dpdiv_accessess(1); - power_stats->fp_acc_execution += power_stats->get_fp_accessess(1); - power_stats->fpmul_acc_execution += power_stats->get_fpmul_accessess(1); - power_stats->fpdiv_acc_execution += power_stats->get_fpdiv_accessess(1); - power_stats->sqrt_acc_execution += power_stats->get_sqrt_accessess(1); - power_stats->log_acc_execution += power_stats->get_log_accessess(1); - power_stats->sin_acc_execution += power_stats->get_sin_accessess(1); - power_stats->exp_acc_execution += power_stats->get_exp_accessess(1); - power_stats->tensor_acc_execution += power_stats->get_tensor_accessess(1); - power_stats->tex_acc_execution += power_stats->get_tex_accessess(1); - power_stats->tot_fpu_acc_execution += power_stats->get_tot_fpu_accessess(1); - power_stats->tot_sfu_acc_execution += power_stats->get_tot_sfu_accessess(1); - power_stats->tot_threads_acc_execution += power_stats->get_tot_threads_kernel(1); - 
power_stats->tot_warps_acc_execution += power_stats->get_tot_warps_kernel(1); - - power_stats->sp_active_lanes_execution += (power_stats->get_sp_active_lanes() * shdr_config->num_shader() * shdr_config->gpgpu_num_sp_units); - power_stats->sfu_active_lanes_execution += (power_stats->get_sfu_active_lanes() * shdr_config->num_shader() * shdr_config->gpgpu_num_sp_units); - - wrapper->set_int_accesses(power_stats->ialu_acc_execution, - power_stats->imul24_acc_execution, - power_stats->imul32_acc_execution, - power_stats->imul_acc_execution, - power_stats->idiv_acc_execution); - - wrapper->set_dp_accesses(power_stats->dp_acc_execution, - power_stats->dpmul_acc_execution, - power_stats->dpdiv_acc_execution); - - wrapper->set_fp_accesses(power_stats->fp_acc_execution, - power_stats->fpmul_acc_execution, - power_stats->fpdiv_acc_execution); - - wrapper->set_trans_accesses(power_stats->sqrt_acc_execution, - power_stats->log_acc_execution, - power_stats->sin_acc_execution, - power_stats->exp_acc_execution); - - wrapper->set_tensor_accesses(power_stats->tensor_acc_execution); - - wrapper->set_tex_accesses(power_stats->tex_acc_execution); - - wrapper->set_exec_unit_power(power_stats->ialu_acc_execution, - power_stats->tot_fpu_acc_execution, - power_stats->tot_sfu_acc_execution); - - wrapper->set_avg_active_threads((double)((double)power_stats->tot_threads_acc_execution / (double)power_stats->tot_warps_acc_execution)); - - // Average active lanes for sp and sfu pipelines - float avg_sp_active_lanes = - (power_stats->sp_active_lanes_execution) / shdr_config->num_shader() / shdr_config->gpgpu_num_sp_units / stat_sample_freq; - float avg_sfu_active_lanes = - (power_stats->sfu_active_lanes_execution) / shdr_config->num_shader() / shdr_config->gpgpu_num_sp_units / stat_sample_freq; - if(avg_sp_active_lanes >32.0 ) - avg_sp_active_lanes = 32.0; - if(avg_sfu_active_lanes >32.0 ) - avg_sfu_active_lanes = 32.0; - assert(avg_sp_active_lanes <= 32); - assert(avg_sfu_active_lanes <= 32); - 
wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); - } - else{ - wrapper->set_int_accesses(power_stats->get_ialu_accessess(1), - power_stats->get_intmul24_accessess(1), - power_stats->get_intmul32_accessess(1), - power_stats->get_intmul_accessess(1), - power_stats->get_intdiv_accessess(1)); - - wrapper->set_dp_accesses(power_stats->get_dp_accessess(1), - power_stats->get_dpmul_accessess(1), - power_stats->get_dpdiv_accessess(1)); - - wrapper->set_fp_accesses(power_stats->get_fp_accessess(1), - power_stats->get_fpmul_accessess(1), - power_stats->get_fpdiv_accessess(1)); - - wrapper->set_trans_accesses(power_stats->get_sqrt_accessess(1), - power_stats->get_log_accessess(1), - power_stats->get_sin_accessess(1), - power_stats->get_exp_accessess(1)); - - wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(1)); - - wrapper->set_tex_accesses(power_stats->get_tex_accessess(1)); - - wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(1), - power_stats->get_ialu_accessess(1), - power_stats->get_tot_sfu_accessess(1)); - - wrapper->set_avg_active_threads(power_stats->get_active_threads(1)); - - // Average active lanes for sp and sfu pipelines - float avg_sp_active_lanes = - (power_stats->get_sp_active_lanes()) / stat_sample_freq; - float avg_sfu_active_lanes = - (power_stats->get_sfu_active_lanes()) / stat_sample_freq; - if(avg_sp_active_lanes >32.0 ) - avg_sp_active_lanes = 32.0; - if(avg_sfu_active_lanes >32.0 ) - avg_sfu_active_lanes = 32.0; - assert(avg_sp_active_lanes <= 32); - assert(avg_sfu_active_lanes <= 32); - wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); - } + wrapper->set_dp_accesses(power_stats->get_dp_accessess(1), + power_stats->get_dpmul_accessess(1), + power_stats->get_dpdiv_accessess(1)); - - double n_icnt_simt_to_mem = - (double) - (power_stats->get_icnt_simt_to_mem(1) - power_stats->noc_tr_kernel); // # flits from SIMT clusters - // to memory partitions - double n_icnt_mem_to_simt 
= - (double) - (power_stats->get_icnt_mem_to_simt(1)- power_stats->noc_rc_kernel); // # flits from memory - // partitions to SIMT clusters - if((power_simulation_mode == 2) && (accelwattch_hybrid_configuration[HW_NOC])) - wrapper->set_NoC_power(n_icnt_mem_to_simt + n_icnt_simt_to_mem); // Number of flits traversing the interconnect from Accel-Sim - else - wrapper->set_NoC_power(std::stod(hw_data[HW_NOC])); // Number of flits traversing the interconnect from HW - - wrapper->compute(); + wrapper->set_fp_accesses(power_stats->get_fp_accessess(1), + power_stats->get_fpmul_accessess(1), + power_stats->get_fpdiv_accessess(1)); - wrapper->update_components_power(); + wrapper->set_trans_accesses( + power_stats->get_sqrt_accessess(1), power_stats->get_log_accessess(1), + power_stats->get_sin_accessess(1), power_stats->get_exp_accessess(1)); - wrapper->power_metrics_calculations(); + wrapper->set_tensor_accesses(power_stats->get_tensor_accessess(1)); - wrapper->dump(); - power_stats->l1r_hits_kernel = power_stats->get_l1d_read_hits(1); - power_stats->l1r_misses_kernel = power_stats->get_l1d_read_misses(1); - power_stats->l1w_hits_kernel = power_stats->get_l1d_write_hits(1); - power_stats->l1w_misses_kernel = power_stats->get_l1d_write_misses(1); - power_stats->shared_accesses_kernel = power_stats->get_const_accessess(1); - power_stats->cc_accesses_kernel = power_stats->get_shmem_access(1); - power_stats->dram_rd_kernel = power_stats->get_dram_rd(1); - power_stats->dram_wr_kernel = power_stats->get_dram_wr(1); - power_stats->dram_pre_kernel = power_stats->get_dram_pre(1); - power_stats->l1i_hits_kernel = power_stats->get_inst_c_hits(1); - power_stats->l1i_misses_kernel = power_stats->get_inst_c_misses(1); - power_stats->l2r_hits_kernel = power_stats->get_l2_read_hits(1); - power_stats->l2r_misses_kernel = power_stats->get_l2_read_misses(1); - power_stats->l2w_hits_kernel = power_stats->get_l2_write_hits(1); - power_stats->l2w_misses_kernel = 
power_stats->get_l2_write_misses(1); - power_stats->noc_tr_kernel = power_stats->get_icnt_simt_to_mem(1); - power_stats->noc_rc_kernel = power_stats->get_icnt_mem_to_simt(1); - - - power_stats->clear(); + wrapper->set_tex_accesses(power_stats->get_tex_accessess(1)); + + wrapper->set_exec_unit_power(power_stats->get_tot_fpu_accessess(1), + power_stats->get_ialu_accessess(1), + power_stats->get_tot_sfu_accessess(1)); + + wrapper->set_avg_active_threads(power_stats->get_active_threads(1)); + + // Average active lanes for sp and sfu pipelines + float avg_sp_active_lanes = + (power_stats->get_sp_active_lanes()) / stat_sample_freq; + float avg_sfu_active_lanes = + (power_stats->get_sfu_active_lanes()) / stat_sample_freq; + if (avg_sp_active_lanes > 32.0) avg_sp_active_lanes = 32.0; + if (avg_sfu_active_lanes > 32.0) avg_sfu_active_lanes = 32.0; + assert(avg_sp_active_lanes <= 32); + assert(avg_sfu_active_lanes <= 32); + wrapper->set_active_lanes_power(avg_sp_active_lanes, avg_sfu_active_lanes); + } + + double n_icnt_simt_to_mem = + (double)(power_stats->get_icnt_simt_to_mem(1) - + power_stats->noc_tr_kernel); // # flits from SIMT clusters + // to memory partitions + double n_icnt_mem_to_simt = + (double)(power_stats->get_icnt_mem_to_simt(1) - + power_stats->noc_rc_kernel); // # flits from memory + // partitions to SIMT clusters + if ((power_simulation_mode == 2) && + (accelwattch_hybrid_configuration[HW_NOC])) + wrapper->set_NoC_power( + n_icnt_mem_to_simt + + n_icnt_simt_to_mem); // Number of flits traversing the interconnect + // from Accel-Sim + else + wrapper->set_NoC_power( + std::stod(hw_data[HW_NOC])); // Number of flits traversing the + // interconnect from HW + + wrapper->compute(); + + wrapper->update_components_power(); + + wrapper->power_metrics_calculations(); + + wrapper->dump(); + power_stats->l1r_hits_kernel = power_stats->get_l1d_read_hits(1); + power_stats->l1r_misses_kernel = power_stats->get_l1d_read_misses(1); + power_stats->l1w_hits_kernel = 
power_stats->get_l1d_write_hits(1); + power_stats->l1w_misses_kernel = power_stats->get_l1d_write_misses(1); + power_stats->shared_accesses_kernel = power_stats->get_const_accessess(1); + power_stats->cc_accesses_kernel = power_stats->get_shmem_access(1); + power_stats->dram_rd_kernel = power_stats->get_dram_rd(1); + power_stats->dram_wr_kernel = power_stats->get_dram_wr(1); + power_stats->dram_pre_kernel = power_stats->get_dram_pre(1); + power_stats->l1i_hits_kernel = power_stats->get_inst_c_hits(1); + power_stats->l1i_misses_kernel = power_stats->get_inst_c_misses(1); + power_stats->l2r_hits_kernel = power_stats->get_l2_read_hits(1); + power_stats->l2r_misses_kernel = power_stats->get_l2_read_misses(1); + power_stats->l2w_hits_kernel = power_stats->get_l2_write_hits(1); + power_stats->l2w_misses_kernel = power_stats->get_l2_write_misses(1); + power_stats->noc_tr_kernel = power_stats->get_icnt_simt_to_mem(1); + power_stats->noc_rc_kernel = power_stats->get_icnt_mem_to_simt(1); + + power_stats->clear(); } \ No newline at end of file diff --git a/src/gpgpu-sim/power_interface.h b/src/gpgpu-sim/power_interface.h index 1c6c51068..3c043e6b6 100644 --- a/src/gpgpu-sim/power_interface.h +++ b/src/gpgpu-sim/power_interface.h @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas, -// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler +// Hetherington, Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, +// Timothy G. Rogers The University of British Columbia, Northwestern +// University, Purdue University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. 
Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -47,16 +48,18 @@ void mcpat_cycle(const gpgpu_sim_config &config, unsigned tot_cycle, unsigned cycle, unsigned tot_inst, unsigned inst, bool dvfs_enabled); -void calculate_hw_mcpat(const gpgpu_sim_config &config, - const shader_core_config *shdr_config, - class gpgpu_sim_wrapper *wrapper, - class power_stat_t *power_stats, unsigned stat_sample_freq, - unsigned tot_cycle, unsigned cycle, unsigned tot_inst, - unsigned inst, int power_simulation_mode, bool dvfs_enabled, - char* hwpowerfile, char* benchname, std::string executed_kernelname, - const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats); +void calculate_hw_mcpat( + const gpgpu_sim_config &config, const shader_core_config *shdr_config, + class gpgpu_sim_wrapper *wrapper, class power_stat_t *power_stats, + unsigned stat_sample_freq, unsigned tot_cycle, unsigned cycle, + unsigned tot_inst, unsigned inst, int power_simulation_mode, + bool dvfs_enabled, char *hwpowerfile, char *benchname, + std::string executed_kernelname, + const bool *accelwattch_hybrid_configuration, bool aggregate_power_stats); -bool parse_hw_file(char* hwpowerfile, bool find_target_kernel, vector &hw_data, char* benchname, std::string executed_kernelname); +bool parse_hw_file(char 
*hwpowerfile, bool find_target_kernel, + vector &hw_data, char *benchname, + std::string executed_kernelname); void mcpat_reset_perf_count(class gpgpu_sim_wrapper *wrapper); diff --git a/src/gpgpu-sim/power_stat.cc b/src/gpgpu-sim/power_stat.cc index d0e673cb3..dead4a0d7 100644 --- a/src/gpgpu-sim/power_stat.cc +++ b/src/gpgpu-sim/power_stat.cc @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas, -// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler +// Hetherington, Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, +// Timothy G. Rogers The University of British Columbia, Northwestern +// University, Purdue University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. 
@@ -56,60 +57,58 @@ power_mem_stat_t::power_mem_stat_t(const memory_config *mem_config, init(); } -void power_stat_t::clear(){ - for(unsigned i=0; i< NUM_STAT_IDX; ++i){ +void power_stat_t::clear() { + for (unsigned i = 0; i < NUM_STAT_IDX; ++i) { pwr_mem_stat->core_cache_stats[i].clear(); pwr_mem_stat->l2_cache_stats[i].clear(); - for(unsigned j=0; jnum_shader(); ++j){ - pwr_core_stat->m_pipeline_duty_cycle[i][j]=0; - pwr_core_stat->m_num_decoded_insn[i][j]=0; - pwr_core_stat->m_num_FPdecoded_insn[i][j]=0; - pwr_core_stat->m_num_INTdecoded_insn[i][j]=0; - pwr_core_stat->m_num_storequeued_insn[i][j]=0; - pwr_core_stat->m_num_loadqueued_insn[i][j]=0; - pwr_core_stat->m_num_tex_inst[i][j]=0; - pwr_core_stat->m_num_ialu_acesses[i][j]=0; - pwr_core_stat->m_num_fp_acesses[i][j]=0; - pwr_core_stat->m_num_imul_acesses[i][j]=0; - pwr_core_stat->m_num_imul24_acesses[i][j]=0; - pwr_core_stat->m_num_imul32_acesses[i][j]=0; - pwr_core_stat->m_num_fpmul_acesses[i][j]=0; - pwr_core_stat->m_num_idiv_acesses[i][j]=0; - pwr_core_stat->m_num_fpdiv_acesses[i][j]=0; - pwr_core_stat->m_num_dp_acesses[i][j]=0; - pwr_core_stat->m_num_dpmul_acesses[i][j]=0; - pwr_core_stat->m_num_dpdiv_acesses[i][j]=0; - pwr_core_stat->m_num_tensor_core_acesses[i][j]=0; - pwr_core_stat->m_num_const_acesses[i][j]=0; - pwr_core_stat->m_num_tex_acesses[i][j]=0; - pwr_core_stat->m_num_sp_acesses[i][j]=0; - pwr_core_stat->m_num_sfu_acesses[i][j]=0; - pwr_core_stat->m_num_sqrt_acesses[i][j]=0; - pwr_core_stat->m_num_log_acesses[i][j]=0; - pwr_core_stat->m_num_sin_acesses[i][j]=0; - pwr_core_stat->m_num_exp_acesses[i][j]=0; - pwr_core_stat->m_num_mem_acesses[i][j]=0; - pwr_core_stat->m_num_sp_committed[i][j]=0; - pwr_core_stat->m_num_sfu_committed[i][j]=0; - pwr_core_stat->m_num_mem_committed[i][j]=0; - pwr_core_stat->m_read_regfile_acesses[i][j]=0; - pwr_core_stat->m_write_regfile_acesses[i][j]=0; - pwr_core_stat->m_non_rf_operands[i][j]=0; - pwr_core_stat->m_active_sp_lanes[i][j]=0; - 
pwr_core_stat->m_active_sfu_lanes[i][j]=0; - pwr_core_stat->m_active_exu_threads[i][j]=0; - pwr_core_stat->m_active_exu_warps[i][j]=0; + for (unsigned j = 0; j < m_config->num_shader(); ++j) { + pwr_core_stat->m_pipeline_duty_cycle[i][j] = 0; + pwr_core_stat->m_num_decoded_insn[i][j] = 0; + pwr_core_stat->m_num_FPdecoded_insn[i][j] = 0; + pwr_core_stat->m_num_INTdecoded_insn[i][j] = 0; + pwr_core_stat->m_num_storequeued_insn[i][j] = 0; + pwr_core_stat->m_num_loadqueued_insn[i][j] = 0; + pwr_core_stat->m_num_tex_inst[i][j] = 0; + pwr_core_stat->m_num_ialu_acesses[i][j] = 0; + pwr_core_stat->m_num_fp_acesses[i][j] = 0; + pwr_core_stat->m_num_imul_acesses[i][j] = 0; + pwr_core_stat->m_num_imul24_acesses[i][j] = 0; + pwr_core_stat->m_num_imul32_acesses[i][j] = 0; + pwr_core_stat->m_num_fpmul_acesses[i][j] = 0; + pwr_core_stat->m_num_idiv_acesses[i][j] = 0; + pwr_core_stat->m_num_fpdiv_acesses[i][j] = 0; + pwr_core_stat->m_num_dp_acesses[i][j] = 0; + pwr_core_stat->m_num_dpmul_acesses[i][j] = 0; + pwr_core_stat->m_num_dpdiv_acesses[i][j] = 0; + pwr_core_stat->m_num_tensor_core_acesses[i][j] = 0; + pwr_core_stat->m_num_const_acesses[i][j] = 0; + pwr_core_stat->m_num_tex_acesses[i][j] = 0; + pwr_core_stat->m_num_sp_acesses[i][j] = 0; + pwr_core_stat->m_num_sfu_acesses[i][j] = 0; + pwr_core_stat->m_num_sqrt_acesses[i][j] = 0; + pwr_core_stat->m_num_log_acesses[i][j] = 0; + pwr_core_stat->m_num_sin_acesses[i][j] = 0; + pwr_core_stat->m_num_exp_acesses[i][j] = 0; + pwr_core_stat->m_num_mem_acesses[i][j] = 0; + pwr_core_stat->m_num_sp_committed[i][j] = 0; + pwr_core_stat->m_num_sfu_committed[i][j] = 0; + pwr_core_stat->m_num_mem_committed[i][j] = 0; + pwr_core_stat->m_read_regfile_acesses[i][j] = 0; + pwr_core_stat->m_write_regfile_acesses[i][j] = 0; + pwr_core_stat->m_non_rf_operands[i][j] = 0; + pwr_core_stat->m_active_sp_lanes[i][j] = 0; + pwr_core_stat->m_active_sfu_lanes[i][j] = 0; + pwr_core_stat->m_active_exu_threads[i][j] = 0; + pwr_core_stat->m_active_exu_warps[i][j] 
= 0; } for (unsigned j = 0; j < m_mem_config->m_n_mem; ++j) { - pwr_mem_stat->n_rd[i][j]=0; - pwr_mem_stat->n_wr[i][j]=0; - pwr_mem_stat->n_pre[i][j]=0; + pwr_mem_stat->n_rd[i][j] = 0; + pwr_mem_stat->n_wr[i][j] = 0; + pwr_mem_stat->n_pre[i][j] = 0; } } } - - void power_mem_stat_t::init() { shmem_access[CURRENT_STAT_IDX] = m_core_stats->gpgpu_n_shmem_bank_access; // Shared memory access @@ -175,7 +174,8 @@ void power_mem_stat_t::print(FILE *fout) const { unsigned total_mem_writes = 0; for (unsigned i = 0; i < m_config->m_n_mem; ++i) { total_mem_reads += n_rd[CURRENT_STAT_IDX][i]; - total_mem_writes += n_wr[CURRENT_STAT_IDX][i] + n_wr_WB[CURRENT_STAT_IDX][i]; + total_mem_writes += + n_wr[CURRENT_STAT_IDX][i] + n_wr_WB[CURRENT_STAT_IDX][i]; } fprintf(fout, "Total memory controller accesses: %u\n", total_mem_reads + total_mem_writes); @@ -205,165 +205,272 @@ void power_core_stat_t::print(FILE *fout) { // per core statistics fprintf(fout, "Power Metrics: \n"); for (unsigned i = 0; i < m_config->num_shader(); i++) { - fprintf(fout,"core %u:\n",i); - fprintf(fout,"\tpipeline duty cycle =%f\n",m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal Deocded Instructions=%u\n",m_num_decoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal FP Deocded Instructions=%u\n",m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal INT Deocded Instructions=%u\n",m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal LOAD Queued Instructions=%u\n",m_num_loadqueued_insn[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal STORE Queued Instructions=%u\n",m_num_storequeued_insn[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal IALU Acesses=%f\n",m_num_ialu_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal FP Acesses=%f\n",m_num_fp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal DP Acesses=%f\n",m_num_dp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal IMUL Acesses=%f\n",m_num_imul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal 
IMUL24 Acesses=%f\n",m_num_imul24_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal IMUL32 Acesses=%f\n",m_num_imul32_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal IDIV Acesses=%f\n",m_num_idiv_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal FPMUL Acesses=%f\n",m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal DPMUL Acesses=%f\n",m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal SQRT Acesses=%f\n",m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal LOG Acesses=%f\n",m_num_log_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal SIN Acesses=%f\n",m_num_sin_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal EXP Acesses=%f\n",m_num_exp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal FPDIV Acesses=%f\n",m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal DPDIV Acesses=%f\n",m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal TENSOR Acesses=%f\n",m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal CONST Acesses=%f\n",m_num_const_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal TEX Acesses=%f\n",m_num_tex_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal SFU Acesses=%f\n",m_num_sfu_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal SP Acesses=%f\n",m_num_sp_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal MEM Acesses=%f\n",m_num_mem_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal SFU Commissions=%u\n",m_num_sfu_committed[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal SP Commissions=%u\n",m_num_sp_committed[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal MEM Commissions=%u\n",m_num_mem_committed[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal REG Reads=%u\n",m_read_regfile_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal REG Writes=%u\n",m_write_regfile_acesses[CURRENT_STAT_IDX][i]); - fprintf(fout,"\tTotal NON REG=%u\n",m_non_rf_operands[CURRENT_STAT_IDX][i]); + fprintf(fout, "core %u:\n", i); + fprintf(fout, 
"\tpipeline duty cycle =%f\n", + m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal Deocded Instructions=%u\n", + m_num_decoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal FP Deocded Instructions=%u\n", + m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal INT Deocded Instructions=%u\n", + m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal LOAD Queued Instructions=%u\n", + m_num_loadqueued_insn[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal STORE Queued Instructions=%u\n", + m_num_storequeued_insn[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal IALU Acesses=%f\n", + m_num_ialu_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal FP Acesses=%f\n", + m_num_fp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal DP Acesses=%f\n", + m_num_dp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal IMUL Acesses=%f\n", + m_num_imul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal IMUL24 Acesses=%f\n", + m_num_imul24_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal IMUL32 Acesses=%f\n", + m_num_imul32_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal IDIV Acesses=%f\n", + m_num_idiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal FPMUL Acesses=%f\n", + m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal DPMUL Acesses=%f\n", + m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal SQRT Acesses=%f\n", + m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal LOG Acesses=%f\n", + m_num_log_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal SIN Acesses=%f\n", + m_num_sin_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal EXP Acesses=%f\n", + m_num_exp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal FPDIV Acesses=%f\n", + m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal DPDIV Acesses=%f\n", + m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal TENSOR Acesses=%f\n", + 
m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal CONST Acesses=%f\n", + m_num_const_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal TEX Acesses=%f\n", + m_num_tex_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal SFU Acesses=%f\n", + m_num_sfu_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal SP Acesses=%f\n", + m_num_sp_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal MEM Acesses=%f\n", + m_num_mem_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal SFU Commissions=%u\n", + m_num_sfu_committed[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal SP Commissions=%u\n", + m_num_sp_committed[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal MEM Commissions=%u\n", + m_num_mem_committed[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal REG Reads=%u\n", + m_read_regfile_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal REG Writes=%u\n", + m_write_regfile_acesses[CURRENT_STAT_IDX][i]); + fprintf(fout, "\tTotal NON REG=%u\n", + m_non_rf_operands[CURRENT_STAT_IDX][i]); } } void power_core_stat_t::init() { - m_pipeline_duty_cycle[CURRENT_STAT_IDX]=m_core_stats->m_pipeline_duty_cycle; - m_num_decoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_decoded_insn; - m_num_FPdecoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_FPdecoded_insn; - m_num_INTdecoded_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_INTdecoded_insn; - m_num_storequeued_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_storequeued_insn; - m_num_loadqueued_insn[CURRENT_STAT_IDX]=m_core_stats->m_num_loadqueued_insn; - m_num_ialu_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_ialu_acesses; - m_num_fp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fp_acesses; - m_num_imul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul_acesses; - m_num_imul24_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul24_acesses; - m_num_imul32_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_imul32_acesses; - m_num_fpmul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fpmul_acesses; - 
m_num_idiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_idiv_acesses; - m_num_fpdiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_fpdiv_acesses; - m_num_dp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dp_acesses; - m_num_dpmul_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dpmul_acesses; - m_num_dpdiv_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_dpdiv_acesses; - m_num_sp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sp_acesses; - m_num_sfu_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sfu_acesses; - m_num_sqrt_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sqrt_acesses; - m_num_log_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_log_acesses; - m_num_sin_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_sin_acesses; - m_num_exp_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_exp_acesses; - m_num_tensor_core_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_tensor_core_acesses; - m_num_const_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_const_acesses; - m_num_tex_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_tex_acesses; - m_num_mem_acesses[CURRENT_STAT_IDX]=m_core_stats->m_num_mem_acesses; - m_num_sp_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_sp_committed; - m_num_sfu_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_sfu_committed; - m_num_mem_committed[CURRENT_STAT_IDX]=m_core_stats->m_num_mem_committed; - m_read_regfile_acesses[CURRENT_STAT_IDX]=m_core_stats->m_read_regfile_acesses; - m_write_regfile_acesses[CURRENT_STAT_IDX]=m_core_stats->m_write_regfile_acesses; - m_non_rf_operands[CURRENT_STAT_IDX]=m_core_stats->m_non_rf_operands; - m_active_sp_lanes[CURRENT_STAT_IDX]=m_core_stats->m_active_sp_lanes; - m_active_sfu_lanes[CURRENT_STAT_IDX]=m_core_stats->m_active_sfu_lanes; - m_active_exu_threads[CURRENT_STAT_IDX]=m_core_stats->m_active_exu_threads; - m_active_exu_warps[CURRENT_STAT_IDX]=m_core_stats->m_active_exu_warps; - m_num_tex_inst[CURRENT_STAT_IDX]=m_core_stats->m_num_tex_inst; - - 
m_pipeline_duty_cycle[PREV_STAT_IDX]=(float*)calloc(m_config->num_shader(),sizeof(float)); - m_num_decoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_num_FPdecoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_num_INTdecoded_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_num_storequeued_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_num_loadqueued_insn[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_num_tex_inst[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - - m_num_ialu_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_fp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_imul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_imul24_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_imul32_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_fpmul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_idiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_fpdiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_dp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_dpmul_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_dpdiv_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_tensor_core_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_const_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_tex_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - 
m_num_sp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_sfu_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_sqrt_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_log_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_sin_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_exp_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_mem_acesses[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_num_sp_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_num_sfu_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_num_mem_committed[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_read_regfile_acesses[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_write_regfile_acesses[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_non_rf_operands[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_active_sp_lanes[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_active_sfu_lanes[PREV_STAT_IDX]=(unsigned *)calloc(m_config->num_shader(),sizeof(unsigned)); - m_active_exu_threads[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); - m_active_exu_warps[PREV_STAT_IDX]=(double *)calloc(m_config->num_shader(),sizeof(double)); + m_pipeline_duty_cycle[CURRENT_STAT_IDX] = m_core_stats->m_pipeline_duty_cycle; + m_num_decoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_decoded_insn; + m_num_FPdecoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_FPdecoded_insn; + m_num_INTdecoded_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_INTdecoded_insn; + m_num_storequeued_insn[CURRENT_STAT_IDX] = + 
m_core_stats->m_num_storequeued_insn; + m_num_loadqueued_insn[CURRENT_STAT_IDX] = m_core_stats->m_num_loadqueued_insn; + m_num_ialu_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_ialu_acesses; + m_num_fp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fp_acesses; + m_num_imul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul_acesses; + m_num_imul24_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul24_acesses; + m_num_imul32_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_imul32_acesses; + m_num_fpmul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fpmul_acesses; + m_num_idiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_idiv_acesses; + m_num_fpdiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_fpdiv_acesses; + m_num_dp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_dp_acesses; + m_num_dpmul_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_dpmul_acesses; + m_num_dpdiv_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_dpdiv_acesses; + m_num_sp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sp_acesses; + m_num_sfu_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sfu_acesses; + m_num_sqrt_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sqrt_acesses; + m_num_log_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_log_acesses; + m_num_sin_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_sin_acesses; + m_num_exp_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_exp_acesses; + m_num_tensor_core_acesses[CURRENT_STAT_IDX] = + m_core_stats->m_num_tensor_core_acesses; + m_num_const_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_const_acesses; + m_num_tex_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_tex_acesses; + m_num_mem_acesses[CURRENT_STAT_IDX] = m_core_stats->m_num_mem_acesses; + m_num_sp_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_sp_committed; + m_num_sfu_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_sfu_committed; + m_num_mem_committed[CURRENT_STAT_IDX] = m_core_stats->m_num_mem_committed; + m_read_regfile_acesses[CURRENT_STAT_IDX] = + 
m_core_stats->m_read_regfile_acesses; + m_write_regfile_acesses[CURRENT_STAT_IDX] = + m_core_stats->m_write_regfile_acesses; + m_non_rf_operands[CURRENT_STAT_IDX] = m_core_stats->m_non_rf_operands; + m_active_sp_lanes[CURRENT_STAT_IDX] = m_core_stats->m_active_sp_lanes; + m_active_sfu_lanes[CURRENT_STAT_IDX] = m_core_stats->m_active_sfu_lanes; + m_active_exu_threads[CURRENT_STAT_IDX] = m_core_stats->m_active_exu_threads; + m_active_exu_warps[CURRENT_STAT_IDX] = m_core_stats->m_active_exu_warps; + m_num_tex_inst[CURRENT_STAT_IDX] = m_core_stats->m_num_tex_inst; + m_pipeline_duty_cycle[PREV_STAT_IDX] = + (float *)calloc(m_config->num_shader(), sizeof(float)); + m_num_decoded_insn[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_num_FPdecoded_insn[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_num_INTdecoded_insn[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_num_storequeued_insn[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_num_loadqueued_insn[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_num_tex_inst[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_num_ialu_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_fp_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_imul_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_imul24_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_imul32_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_fpmul_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_idiv_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + 
m_num_fpdiv_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_dp_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_dpmul_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_dpdiv_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_tensor_core_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_const_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_tex_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_sp_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_sfu_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_sqrt_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_log_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_sin_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_exp_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_mem_acesses[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_num_sp_committed[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_num_sfu_committed[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_num_mem_committed[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_read_regfile_acesses[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_write_regfile_acesses[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_non_rf_operands[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + 
m_active_sp_lanes[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_active_sfu_lanes[PREV_STAT_IDX] = + (unsigned *)calloc(m_config->num_shader(), sizeof(unsigned)); + m_active_exu_threads[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); + m_active_exu_warps[PREV_STAT_IDX] = + (double *)calloc(m_config->num_shader(), sizeof(double)); } void power_core_stat_t::save_stats() { for (unsigned i = 0; i < m_config->num_shader(); ++i) { - m_pipeline_duty_cycle[PREV_STAT_IDX][i]=m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]; - m_num_decoded_insn[PREV_STAT_IDX][i]= m_num_decoded_insn[CURRENT_STAT_IDX][i]; - m_num_FPdecoded_insn[PREV_STAT_IDX][i]=m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]; - m_num_INTdecoded_insn[PREV_STAT_IDX][i]=m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]; - m_num_storequeued_insn[PREV_STAT_IDX][i]=m_num_storequeued_insn[CURRENT_STAT_IDX][i]; - m_num_loadqueued_insn[PREV_STAT_IDX][i]=m_num_loadqueued_insn[CURRENT_STAT_IDX][i]; - m_num_ialu_acesses[PREV_STAT_IDX][i]=m_num_ialu_acesses[CURRENT_STAT_IDX][i]; - m_num_fp_acesses[PREV_STAT_IDX][i]=m_num_fp_acesses[CURRENT_STAT_IDX][i]; - m_num_tex_inst[PREV_STAT_IDX][i]=m_num_tex_inst[CURRENT_STAT_IDX][i]; - m_num_imul_acesses[PREV_STAT_IDX][i]=m_num_imul_acesses[CURRENT_STAT_IDX][i]; - m_num_imul24_acesses[PREV_STAT_IDX][i]=m_num_imul24_acesses[CURRENT_STAT_IDX][i]; - m_num_imul32_acesses[PREV_STAT_IDX][i]=m_num_imul32_acesses[CURRENT_STAT_IDX][i]; - m_num_fpmul_acesses[PREV_STAT_IDX][i]=m_num_fpmul_acesses[CURRENT_STAT_IDX][i]; - m_num_idiv_acesses[PREV_STAT_IDX][i]=m_num_idiv_acesses[CURRENT_STAT_IDX][i]; - m_num_fpdiv_acesses[PREV_STAT_IDX][i]=m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]; - m_num_sp_acesses[PREV_STAT_IDX][i]=m_num_sp_acesses[CURRENT_STAT_IDX][i]; - m_num_sfu_acesses[PREV_STAT_IDX][i]=m_num_sfu_acesses[CURRENT_STAT_IDX][i]; - m_num_sqrt_acesses[PREV_STAT_IDX][i]=m_num_sqrt_acesses[CURRENT_STAT_IDX][i]; - 
m_num_log_acesses[PREV_STAT_IDX][i]=m_num_log_acesses[CURRENT_STAT_IDX][i]; - m_num_sin_acesses[PREV_STAT_IDX][i]=m_num_sin_acesses[CURRENT_STAT_IDX][i]; - m_num_exp_acesses[PREV_STAT_IDX][i]=m_num_exp_acesses[CURRENT_STAT_IDX][i]; - m_num_dp_acesses[PREV_STAT_IDX][i]=m_num_dp_acesses[CURRENT_STAT_IDX][i]; - m_num_dpmul_acesses[PREV_STAT_IDX][i]=m_num_dpmul_acesses[CURRENT_STAT_IDX][i]; - m_num_dpdiv_acesses[PREV_STAT_IDX][i]=m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]; - m_num_tensor_core_acesses[PREV_STAT_IDX][i]=m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]; - m_num_const_acesses[PREV_STAT_IDX][i]=m_num_const_acesses[CURRENT_STAT_IDX][i]; - m_num_tex_acesses[PREV_STAT_IDX][i]=m_num_tex_acesses[CURRENT_STAT_IDX][i]; - m_num_mem_acesses[PREV_STAT_IDX][i]=m_num_mem_acesses[CURRENT_STAT_IDX][i]; - m_num_sp_committed[PREV_STAT_IDX][i]=m_num_sp_committed[CURRENT_STAT_IDX][i]; - m_num_sfu_committed[PREV_STAT_IDX][i]=m_num_sfu_committed[CURRENT_STAT_IDX][i]; - m_num_mem_committed[PREV_STAT_IDX][i]=m_num_mem_committed[CURRENT_STAT_IDX][i]; - m_read_regfile_acesses[PREV_STAT_IDX][i]=m_read_regfile_acesses[CURRENT_STAT_IDX][i]; - m_write_regfile_acesses[PREV_STAT_IDX][i]=m_write_regfile_acesses[CURRENT_STAT_IDX][i]; - m_non_rf_operands[PREV_STAT_IDX][i]=m_non_rf_operands[CURRENT_STAT_IDX][i]; - m_active_sp_lanes[PREV_STAT_IDX][i]=m_active_sp_lanes[CURRENT_STAT_IDX][i]; - m_active_sfu_lanes[PREV_STAT_IDX][i]=m_active_sfu_lanes[CURRENT_STAT_IDX][i]; - m_active_exu_threads[PREV_STAT_IDX][i]=m_active_exu_threads[CURRENT_STAT_IDX][i]; - m_active_exu_warps[PREV_STAT_IDX][i]=m_active_exu_warps[CURRENT_STAT_IDX][i]; + m_pipeline_duty_cycle[PREV_STAT_IDX][i] = + m_pipeline_duty_cycle[CURRENT_STAT_IDX][i]; + m_num_decoded_insn[PREV_STAT_IDX][i] = + m_num_decoded_insn[CURRENT_STAT_IDX][i]; + m_num_FPdecoded_insn[PREV_STAT_IDX][i] = + m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]; + m_num_INTdecoded_insn[PREV_STAT_IDX][i] = + m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]; + 
m_num_storequeued_insn[PREV_STAT_IDX][i] = + m_num_storequeued_insn[CURRENT_STAT_IDX][i]; + m_num_loadqueued_insn[PREV_STAT_IDX][i] = + m_num_loadqueued_insn[CURRENT_STAT_IDX][i]; + m_num_ialu_acesses[PREV_STAT_IDX][i] = + m_num_ialu_acesses[CURRENT_STAT_IDX][i]; + m_num_fp_acesses[PREV_STAT_IDX][i] = m_num_fp_acesses[CURRENT_STAT_IDX][i]; + m_num_tex_inst[PREV_STAT_IDX][i] = m_num_tex_inst[CURRENT_STAT_IDX][i]; + m_num_imul_acesses[PREV_STAT_IDX][i] = + m_num_imul_acesses[CURRENT_STAT_IDX][i]; + m_num_imul24_acesses[PREV_STAT_IDX][i] = + m_num_imul24_acesses[CURRENT_STAT_IDX][i]; + m_num_imul32_acesses[PREV_STAT_IDX][i] = + m_num_imul32_acesses[CURRENT_STAT_IDX][i]; + m_num_fpmul_acesses[PREV_STAT_IDX][i] = + m_num_fpmul_acesses[CURRENT_STAT_IDX][i]; + m_num_idiv_acesses[PREV_STAT_IDX][i] = + m_num_idiv_acesses[CURRENT_STAT_IDX][i]; + m_num_fpdiv_acesses[PREV_STAT_IDX][i] = + m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]; + m_num_sp_acesses[PREV_STAT_IDX][i] = m_num_sp_acesses[CURRENT_STAT_IDX][i]; + m_num_sfu_acesses[PREV_STAT_IDX][i] = + m_num_sfu_acesses[CURRENT_STAT_IDX][i]; + m_num_sqrt_acesses[PREV_STAT_IDX][i] = + m_num_sqrt_acesses[CURRENT_STAT_IDX][i]; + m_num_log_acesses[PREV_STAT_IDX][i] = + m_num_log_acesses[CURRENT_STAT_IDX][i]; + m_num_sin_acesses[PREV_STAT_IDX][i] = + m_num_sin_acesses[CURRENT_STAT_IDX][i]; + m_num_exp_acesses[PREV_STAT_IDX][i] = + m_num_exp_acesses[CURRENT_STAT_IDX][i]; + m_num_dp_acesses[PREV_STAT_IDX][i] = m_num_dp_acesses[CURRENT_STAT_IDX][i]; + m_num_dpmul_acesses[PREV_STAT_IDX][i] = + m_num_dpmul_acesses[CURRENT_STAT_IDX][i]; + m_num_dpdiv_acesses[PREV_STAT_IDX][i] = + m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]; + m_num_tensor_core_acesses[PREV_STAT_IDX][i] = + m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]; + m_num_const_acesses[PREV_STAT_IDX][i] = + m_num_const_acesses[CURRENT_STAT_IDX][i]; + m_num_tex_acesses[PREV_STAT_IDX][i] = + m_num_tex_acesses[CURRENT_STAT_IDX][i]; + m_num_mem_acesses[PREV_STAT_IDX][i] = + 
m_num_mem_acesses[CURRENT_STAT_IDX][i]; + m_num_sp_committed[PREV_STAT_IDX][i] = + m_num_sp_committed[CURRENT_STAT_IDX][i]; + m_num_sfu_committed[PREV_STAT_IDX][i] = + m_num_sfu_committed[CURRENT_STAT_IDX][i]; + m_num_mem_committed[PREV_STAT_IDX][i] = + m_num_mem_committed[CURRENT_STAT_IDX][i]; + m_read_regfile_acesses[PREV_STAT_IDX][i] = + m_read_regfile_acesses[CURRENT_STAT_IDX][i]; + m_write_regfile_acesses[PREV_STAT_IDX][i] = + m_write_regfile_acesses[CURRENT_STAT_IDX][i]; + m_non_rf_operands[PREV_STAT_IDX][i] = + m_non_rf_operands[CURRENT_STAT_IDX][i]; + m_active_sp_lanes[PREV_STAT_IDX][i] = + m_active_sp_lanes[CURRENT_STAT_IDX][i]; + m_active_sfu_lanes[PREV_STAT_IDX][i] = + m_active_sfu_lanes[CURRENT_STAT_IDX][i]; + m_active_exu_threads[PREV_STAT_IDX][i] = + m_active_exu_threads[CURRENT_STAT_IDX][i]; + m_active_exu_warps[PREV_STAT_IDX][i] = + m_active_exu_warps[CURRENT_STAT_IDX][i]; } } @@ -390,12 +497,12 @@ power_stat_t::power_stat_t(const shader_core_config *shader_config, dram_rd_kernel = 0; dram_wr_kernel = 0; dram_pre_kernel = 0; - l1i_hits_kernel =0; - l1i_misses_kernel =0; - l2r_hits_kernel =0; - l2r_misses_kernel =0; - l2w_hits_kernel =0; - l2w_misses_kernel =0; + l1i_hits_kernel = 0; + l1i_misses_kernel = 0; + l2r_hits_kernel = 0; + l2r_misses_kernel = 0; + l2w_hits_kernel = 0; + l2w_misses_kernel = 0; noc_tr_kernel = 0; noc_rc_kernel = 0; diff --git a/src/gpgpu-sim/power_stat.h b/src/gpgpu-sim/power_stat.h index d40f1d98a..13f144ab4 100644 --- a/src/gpgpu-sim/power_stat.h +++ b/src/gpgpu-sim/power_stat.h @@ -1,17 +1,18 @@ -// Copyright (c) 2009-2021, Tor M. Aamodt, Ahmed El-Shafiey, Tayler Hetherington, Vijay Kandiah, Nikos Hardavellas, -// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// Copyright (c) 2009-2021, Tor M. 
Aamodt, Ahmed El-Shafiey, Tayler +// Hetherington, Vijay Kandiah, Nikos Hardavellas, Mahmoud Khairy, Junrui Pan, +// Timothy G. Rogers The University of British Columbia, Northwestern +// University, Purdue University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. 
@@ -53,40 +54,40 @@ struct shader_core_power_stats_pod { unsigned *m_num_INTdecoded_insn[NUM_STAT_IDX]; // number of instructions committed // by this shader core - unsigned *m_num_storequeued_insn[NUM_STAT_IDX]; - unsigned *m_num_loadqueued_insn[NUM_STAT_IDX]; - unsigned *m_num_tex_inst[NUM_STAT_IDX]; - double *m_num_ialu_acesses[NUM_STAT_IDX]; - double *m_num_fp_acesses[NUM_STAT_IDX]; - double *m_num_imul_acesses[NUM_STAT_IDX]; - double *m_num_imul32_acesses[NUM_STAT_IDX]; - double *m_num_imul24_acesses[NUM_STAT_IDX]; - double *m_num_fpmul_acesses[NUM_STAT_IDX]; - double *m_num_idiv_acesses[NUM_STAT_IDX]; - double *m_num_fpdiv_acesses[NUM_STAT_IDX]; - double *m_num_dp_acesses[NUM_STAT_IDX]; - double *m_num_dpmul_acesses[NUM_STAT_IDX]; - double *m_num_dpdiv_acesses[NUM_STAT_IDX]; - double *m_num_sp_acesses[NUM_STAT_IDX]; - double *m_num_sfu_acesses[NUM_STAT_IDX]; - double *m_num_sqrt_acesses[NUM_STAT_IDX]; - double *m_num_log_acesses[NUM_STAT_IDX]; - double *m_num_sin_acesses[NUM_STAT_IDX]; - double *m_num_exp_acesses[NUM_STAT_IDX]; - double *m_num_tensor_core_acesses[NUM_STAT_IDX]; - double *m_num_const_acesses[NUM_STAT_IDX]; - double *m_num_tex_acesses[NUM_STAT_IDX]; - double *m_num_mem_acesses[NUM_STAT_IDX]; - unsigned *m_num_sp_committed[NUM_STAT_IDX]; - unsigned *m_num_sfu_committed[NUM_STAT_IDX]; - unsigned *m_num_mem_committed[NUM_STAT_IDX]; - unsigned *m_active_sp_lanes[NUM_STAT_IDX]; - unsigned *m_active_sfu_lanes[NUM_STAT_IDX]; - double *m_active_exu_threads[NUM_STAT_IDX]; - double *m_active_exu_warps[NUM_STAT_IDX]; - unsigned *m_read_regfile_acesses[NUM_STAT_IDX]; - unsigned *m_write_regfile_acesses[NUM_STAT_IDX]; - unsigned *m_non_rf_operands[NUM_STAT_IDX]; + unsigned *m_num_storequeued_insn[NUM_STAT_IDX]; + unsigned *m_num_loadqueued_insn[NUM_STAT_IDX]; + unsigned *m_num_tex_inst[NUM_STAT_IDX]; + double *m_num_ialu_acesses[NUM_STAT_IDX]; + double *m_num_fp_acesses[NUM_STAT_IDX]; + double *m_num_imul_acesses[NUM_STAT_IDX]; + double 
*m_num_imul32_acesses[NUM_STAT_IDX]; + double *m_num_imul24_acesses[NUM_STAT_IDX]; + double *m_num_fpmul_acesses[NUM_STAT_IDX]; + double *m_num_idiv_acesses[NUM_STAT_IDX]; + double *m_num_fpdiv_acesses[NUM_STAT_IDX]; + double *m_num_dp_acesses[NUM_STAT_IDX]; + double *m_num_dpmul_acesses[NUM_STAT_IDX]; + double *m_num_dpdiv_acesses[NUM_STAT_IDX]; + double *m_num_sp_acesses[NUM_STAT_IDX]; + double *m_num_sfu_acesses[NUM_STAT_IDX]; + double *m_num_sqrt_acesses[NUM_STAT_IDX]; + double *m_num_log_acesses[NUM_STAT_IDX]; + double *m_num_sin_acesses[NUM_STAT_IDX]; + double *m_num_exp_acesses[NUM_STAT_IDX]; + double *m_num_tensor_core_acesses[NUM_STAT_IDX]; + double *m_num_const_acesses[NUM_STAT_IDX]; + double *m_num_tex_acesses[NUM_STAT_IDX]; + double *m_num_mem_acesses[NUM_STAT_IDX]; + unsigned *m_num_sp_committed[NUM_STAT_IDX]; + unsigned *m_num_sfu_committed[NUM_STAT_IDX]; + unsigned *m_num_mem_committed[NUM_STAT_IDX]; + unsigned *m_active_sp_lanes[NUM_STAT_IDX]; + unsigned *m_active_sfu_lanes[NUM_STAT_IDX]; + double *m_active_exu_threads[NUM_STAT_IDX]; + double *m_active_exu_warps[NUM_STAT_IDX]; + unsigned *m_read_regfile_acesses[NUM_STAT_IDX]; + unsigned *m_write_regfile_acesses[NUM_STAT_IDX]; + unsigned *m_non_rf_operands[NUM_STAT_IDX]; }; class power_core_stat_t : public shader_core_power_stats_pod { @@ -97,7 +98,6 @@ class power_core_stat_t : public shader_core_power_stats_pod { void print(FILE *fout); void init(); void save_stats(); - private: shader_core_stats *m_core_stats; @@ -206,35 +206,37 @@ class power_stat_t { double get_total_inst(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) + if (aggregate_stat) total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]); else total_inst += (pwr_core_stat->m_num_decoded_insn[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_decoded_insn[PREV_STAT_IDX][i]); + (pwr_core_stat->m_num_decoded_insn[PREV_STAT_IDX][i]); } return total_inst; } 
double get_total_int_inst(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) - total_inst += - (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); - else + if (aggregate_stat) total_inst += - (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_INTdecoded_insn[PREV_STAT_IDX][i]); + (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_INTdecoded_insn[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_INTdecoded_insn[PREV_STAT_IDX][i]); } return total_inst; } double get_total_fp_inst(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) - total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_FPdecoded_insn[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_FPdecoded_insn[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_FPdecoded_insn[PREV_STAT_IDX][i]); } return total_inst; } @@ -283,43 +285,45 @@ class power_stat_t { double get_committed_inst(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) - total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) + - (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) + - (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]); else - total_inst += (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - - 
(pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_sp_committed[PREV_STAT_IDX][i]); + total_inst += + (pwr_core_stat->m_num_mem_committed[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_mem_committed[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sfu_committed[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sfu_committed[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sp_committed[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sp_committed[PREV_STAT_IDX][i]); } return total_inst; } double get_regfile_reads(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) - total_inst += - (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]); else total_inst += - (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_read_regfile_acesses[PREV_STAT_IDX][i]); + (pwr_core_stat->m_read_regfile_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_read_regfile_acesses[PREV_STAT_IDX][i]); } return total_inst; } double get_regfile_writes(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) + if (aggregate_stat) total_inst += - (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]); + (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]); else total_inst += - (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_write_regfile_acesses[PREV_STAT_IDX][i]); + (pwr_core_stat->m_write_regfile_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_write_regfile_acesses[PREV_STAT_IDX][i]); } return total_inst; } @@ -337,11 +341,11 @@ class power_stat_t { double 
get_non_regfile_operands(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) - total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]); + if (aggregate_stat) + total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]); else total_inst += (pwr_core_stat->m_non_rf_operands[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_non_rf_operands[PREV_STAT_IDX][i]); + (pwr_core_stat->m_non_rf_operands[PREV_STAT_IDX][i]); } return total_inst; } @@ -364,45 +368,49 @@ class power_stat_t { return total_inst; } - double get_sqrt_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst+=(pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); - else - total_inst+=(pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]); - } - return total_inst; - } - double get_log_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst+=(pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]); - else - total_inst+=(pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]); - } - return total_inst; - } - double get_sin_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst+=(pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]); - else - total_inst+=(pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]); - } - return total_inst; - } - double get_exp_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst+=(pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]); - else - total_inst+=(pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - 
(pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]); - } - return total_inst; + double get_sqrt_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_log_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_sin_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]); + } + return total_inst; + } + double get_exp_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]); + } + return total_inst; } double get_mem_accessess() { @@ -417,11 +425,11 @@ class power_stat_t { double get_intdiv_accessess(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) + if (aggregate_stat) total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]); else total_inst += 
(pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]); + (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } @@ -429,11 +437,12 @@ class power_stat_t { double get_fpdiv_accessess(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) + if (aggregate_stat) total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } @@ -441,11 +450,13 @@ class power_stat_t { double get_intmul32_accessess(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) - total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]); } return total_inst; } @@ -453,118 +464,126 @@ class power_stat_t { double get_intmul24_accessess(bool aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) - total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]); + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]); + else + 
total_inst += + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]); } return total_inst; } - double get_intmul_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst+= (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]); - else - total_inst+= (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]); - } - return total_inst; + double get_intmul_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]); + } + return total_inst; } - double get_fpmul_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst += (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]); + double get_fpmul_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]); } return total_inst; } - double get_fp_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - + double get_fp_accessess(bool 
aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]); } return total_inst; } - double get_dp_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - + double get_dp_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); } return total_inst; } - double get_dpmul_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) + double get_dpmul_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) total_inst += (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]); } return total_inst; } - double get_dpdiv_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) + double get_dpdiv_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) total_inst += 
(pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]); } return total_inst; } - double get_tensor_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst += (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]); + double get_tensor_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]); } return total_inst; } - double get_const_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst += pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]; - else - total_inst += (pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_const_acesses[PREV_STAT_IDX][i]); + double get_const_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]; + else + total_inst += + (pwr_core_stat->m_num_const_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_const_acesses[PREV_STAT_IDX][i]); } return (total_inst); } - double get_tex_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; 
inum_shader();i++){ - if(aggregate_stat) + double get_tex_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - + else + total_inst += (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); } return total_inst; @@ -590,23 +609,24 @@ class power_stat_t { m_config->gpgpu_num_sfu_units; } - float get_active_threads(bool aggregate_stat) { unsigned total_threads = 0; unsigned total_warps = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat){ - total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) ; + if (aggregate_stat) { + total_threads += + (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]); total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]); + } else { + total_threads += + (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); + total_warps += + (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); } - else{ - total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); - total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); - } } - if(total_warps != 0) + if (total_warps != 0) return (float)((float)total_threads / (float)total_warps); else return 0; @@ -615,98 +635,98 @@ class power_stat_t { unsigned long long get_tot_threads_kernel(bool aggregate_stat) { unsigned total_threads = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat){ - total_threads += 
(pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) ; + if (aggregate_stat) { + total_threads += + (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]); + } else { + total_threads += + (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); } - else{ - total_threads += (pwr_core_stat->m_active_exu_threads[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_active_exu_threads[PREV_STAT_IDX][i]); - } } - return total_threads; + return total_threads; } unsigned long long get_tot_warps_kernel(bool aggregate_stat) { unsigned long long total_warps = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat){ + if (aggregate_stat) { total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]); + } else { + total_warps += + (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); } - else{ - total_warps += (pwr_core_stat->m_active_exu_warps[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_active_exu_warps[PREV_STAT_IDX][i]); - } } - return total_warps; + return total_warps; } - - double get_tot_fpu_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); + double get_tot_fpu_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]); else - total_inst += (pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); + total_inst += 
(pwr_core_stat->m_num_fp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fp_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dp_acesses[PREV_STAT_IDX][i]); } - //total_inst += get_total_load_inst()+get_total_store_inst()+get_tex_inst(); + // total_inst += + // get_total_load_inst()+get_total_store_inst()+get_tex_inst(); return total_inst; } - - - double get_tot_sfu_accessess(bool aggregate_stat){ - double total_inst=0; - for(unsigned i=0; inum_shader();i++){ - if(aggregate_stat) - total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i])+ - (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); - else - total_inst += (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - - 
(pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]) + - (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); - + double get_tot_sfu_accessess(bool aggregate_stat) { + double total_inst = 0; + for (unsigned i = 0; i < m_config->num_shader(); i++) { + if (aggregate_stat) + total_inst += + (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) + 
+ (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) + + (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]); + else + total_inst += + (pwr_core_stat->m_num_idiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_idiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul32_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul32_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sqrt_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sqrt_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_log_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_log_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_sin_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_sin_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_exp_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_exp_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpdiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_fpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_fpmul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpmul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpmul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_dpdiv_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_dpdiv_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul24_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul24_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_imul_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_imul_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_tensor_core_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tensor_core_acesses[PREV_STAT_IDX][i]) + + (pwr_core_stat->m_num_tex_acesses[CURRENT_STAT_IDX][i]) - + (pwr_core_stat->m_num_tex_acesses[PREV_STAT_IDX][i]); } return total_inst; } @@ -714,11 +734,11 @@ class power_stat_t { double get_ialu_accessess(bool 
aggregate_stat) { double total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) + if (aggregate_stat) total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]); - else + else total_inst += (pwr_core_stat->m_num_ialu_acesses[CURRENT_STAT_IDX][i]) - - (pwr_core_stat->m_num_ialu_acesses[PREV_STAT_IDX][i]); + (pwr_core_stat->m_num_ialu_acesses[PREV_STAT_IDX][i]); } return total_inst; } @@ -805,17 +825,16 @@ class power_stat_t { sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - if(aggregate_stat) + if (aggregate_stat) return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + access_type, num_access_type, request_status, num_request_status)); else return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); } double get_inst_c_misses(bool aggregate_stat) { enum mem_access_type access_type[] = {INST_ACC_R}; @@ -824,46 +843,45 @@ class power_stat_t { sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - if(aggregate_stat) + if (aggregate_stat) return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + access_type, num_access_type, request_status, num_request_status)); else return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, 
num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); } double get_inst_c_hits(bool aggregate_stat) { - return (get_inst_c_accesses(aggregate_stat) - get_inst_c_misses(aggregate_stat)); + return (get_inst_c_accesses(aggregate_stat) - + get_inst_c_misses(aggregate_stat)); } double get_l1d_read_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_R, LOCAL_ACC_R}; - enum cache_request_status request_status[] = {HIT, MISS, SECTOR_MISS}; + enum cache_request_status request_status[] = {HIT, MISS, SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - if(aggregate_stat){ + if (aggregate_stat) { return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - else{ + access_type, num_access_type, request_status, num_request_status)); + } else { return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } } double get_l1d_read_misses(bool aggregate_stat) { - return (get_l1d_read_accesses(aggregate_stat) - get_l1d_read_hits(aggregate_stat)); + return (get_l1d_read_accesses(aggregate_stat) 
- + get_l1d_read_hits(aggregate_stat)); } double get_l1d_read_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_R, LOCAL_ACC_R}; @@ -873,19 +891,17 @@ class power_stat_t { unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - if(aggregate_stat){ - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - else{ + if (aggregate_stat) { return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } } double get_l1d_write_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W}; @@ -895,22 +911,21 @@ class power_stat_t { unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - if(aggregate_stat){ - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - else{ + if (aggregate_stat) { return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } + access_type, num_access_type, request_status, num_request_status)); + } else { + return 
(pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } } double get_l1d_write_misses(bool aggregate_stat) { - return (get_l1d_write_accesses(aggregate_stat) - get_l1d_write_hits(aggregate_stat)); + return (get_l1d_write_accesses(aggregate_stat) - + get_l1d_write_hits(aggregate_stat)); } double get_l1d_write_hits(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W}; @@ -920,19 +935,17 @@ class power_stat_t { unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - if(aggregate_stat){ - return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - else{ + if (aggregate_stat) { return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->core_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); + } } double get_cache_misses() { return get_l1d_read_misses(0) + get_constant_c_misses() + @@ -949,111 +962,107 @@ class power_stat_t { double get_shmem_access(bool aggregate_stat) { unsigned total_inst = 0; for (unsigned i = 0; i < m_config->num_shader(); i++) { - if(aggregate_stat) + if (aggregate_stat) total_inst += (pwr_mem_stat->shmem_access[CURRENT_STAT_IDX][i]); else total_inst += 
(pwr_mem_stat->shmem_access[CURRENT_STAT_IDX][i]) - - (pwr_mem_stat->shmem_access[PREV_STAT_IDX][i]); + (pwr_mem_stat->shmem_access[PREV_STAT_IDX][i]); } return total_inst; } - unsigned long long get_l2_read_accesses(bool aggregate_stat) { + unsigned long long get_l2_read_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = { GLOBAL_ACC_R, LOCAL_ACC_R, CONST_ACC_R, TEXTURE_ACC_R, INST_ACC_R}; - enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, SECTOR_MISS}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, + SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - if(aggregate_stat){ - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - else{ + if (aggregate_stat) { return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); } } unsigned long long get_l2_read_misses(bool aggregate_stat) { - return (get_l2_read_accesses(aggregate_stat) - get_l2_read_hits(aggregate_stat)); + return (get_l2_read_accesses(aggregate_stat) - + get_l2_read_hits(aggregate_stat)); } unsigned long long get_l2_read_hits(bool aggregate_stat) { - enum mem_access_type access_type[] = { + enum mem_access_type access_type[] = { GLOBAL_ACC_R, LOCAL_ACC_R, CONST_ACC_R, 
TEXTURE_ACC_R, INST_ACC_R}; - enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - if(aggregate_stat){ - return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - else{ + if (aggregate_stat) { return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + access_type, num_access_type, request_status, num_request_status)); + } else { + return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); } } unsigned long long get_l2_write_accesses(bool aggregate_stat) { enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, L1_WRBK_ACC}; - enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, SECTOR_MISS}; + enum cache_request_status request_status[] = {HIT, HIT_RESERVED, MISS, + SECTOR_MISS}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - if(aggregate_stat){ + if (aggregate_stat) { return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - else{ + access_type, num_access_type, request_status, num_request_status)); + } else { return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - 
access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); } } unsigned long long get_l2_write_misses(bool aggregate_stat) { - return (get_l2_write_accesses(aggregate_stat) - get_l2_write_hits(aggregate_stat)); + return (get_l2_write_accesses(aggregate_stat) - + get_l2_write_hits(aggregate_stat)); } unsigned long long get_l2_write_hits(bool aggregate_stat) { - enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, + enum mem_access_type access_type[] = {GLOBAL_ACC_W, LOCAL_ACC_W, L1_WRBK_ACC}; enum cache_request_status request_status[] = {HIT, HIT_RESERVED}; unsigned num_access_type = sizeof(access_type) / sizeof(enum mem_access_type); unsigned num_request_status = sizeof(request_status) / sizeof(enum cache_request_status); - if(aggregate_stat){ + if (aggregate_stat) { return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); - } - else{ + access_type, num_access_type, request_status, num_request_status)); + } else { return (pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)) - - (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( - access_type, num_access_type, request_status, - num_request_status)); + access_type, num_access_type, request_status, + num_request_status)) - + (pwr_mem_stat->l2_cache_stats[PREV_STAT_IDX].get_stats( + access_type, num_access_type, request_status, + num_request_status)); } } double get_dram_cmd() { @@ -1091,12 +1100,11 @@ class power_stat_t { double get_dram_pre(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < 
m_mem_config->m_n_mem; ++i) { - if(aggregate_stat){ + if (aggregate_stat) { total += pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i]; - } - else{ + } else { total += (pwr_mem_stat->n_pre[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_pre[PREV_STAT_IDX][i]); + pwr_mem_stat->n_pre[PREV_STAT_IDX][i]); } } return total; @@ -1104,12 +1112,11 @@ class power_stat_t { double get_dram_rd(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - if(aggregate_stat){ + if (aggregate_stat) { total += pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i]; - } - else{ + } else { total += (pwr_mem_stat->n_rd[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_rd[PREV_STAT_IDX][i]); + pwr_mem_stat->n_rd[PREV_STAT_IDX][i]); } } return total; @@ -1117,15 +1124,14 @@ class power_stat_t { double get_dram_wr(bool aggregate_stat) { unsigned total = 0; for (unsigned i = 0; i < m_mem_config->m_n_mem; ++i) { - if(aggregate_stat){ - total += pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] + - pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i]; - } - else{ - total += (pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_wr[PREV_STAT_IDX][i]) + - (pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_wr_WB[PREV_STAT_IDX][i]); + if (aggregate_stat) { + total += pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] + + pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i]; + } else { + total += (pwr_mem_stat->n_wr[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_wr[PREV_STAT_IDX][i]) + + (pwr_mem_stat->n_wr_WB[CURRENT_STAT_IDX][i] - + pwr_mem_stat->n_wr_WB[PREV_STAT_IDX][i]); } } return total; @@ -1141,13 +1147,12 @@ class power_stat_t { unsigned long long get_icnt_simt_to_mem(bool aggregate_stat) { long total = 0; - for (unsigned i = 0; i < m_config->n_simt_clusters; ++i){ - if(aggregate_stat){ + for (unsigned i = 0; i < m_config->n_simt_clusters; ++i) { + if (aggregate_stat) { total += pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i]; - } - else{ + } else { total += (pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i] - - 
pwr_mem_stat->n_simt_to_mem[PREV_STAT_IDX][i]); + pwr_mem_stat->n_simt_to_mem[PREV_STAT_IDX][i]); } } return total; @@ -1156,13 +1161,13 @@ class power_stat_t { unsigned long long get_icnt_mem_to_simt(bool aggregate_stat) { long total = 0; for (unsigned i = 0; i < m_config->n_simt_clusters; ++i) { - if(aggregate_stat){ + if (aggregate_stat) { total += pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]; } - - else{ + + else { total += (pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i] - - pwr_mem_stat->n_mem_to_simt[PREV_STAT_IDX][i]); + pwr_mem_stat->n_mem_to_simt[PREV_STAT_IDX][i]); } } return total; diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 855aa1c14..9fe4c092c 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -1,18 +1,19 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Ali Bakhoda, -// George L. Yuan, Andrew Turner, Inderpreet Singh, Vijay Kandiah, Nikos Hardavellas, -// Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// George L. Yuan, Andrew Turner, Inderpreet Singh, Vijay Kandiah, Nikos +// Hardavellas, Mahmoud Khairy, Junrui Pan, Timothy G. Rogers The University of +// British Columbia, Northwestern University, Purdue University All rights +// reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. 
Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -187,20 +188,16 @@ void shader_core_ctx::create_schedulers() { // must currently occur after all inputs have been initialized. std::string sched_config = m_config->gpgpu_scheduler_string; const concrete_scheduler scheduler = - sched_config.find("lrr") != std::string::npos - ? CONCRETE_SCHEDULER_LRR - : sched_config.find("two_level_active") != std::string::npos - ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE - : sched_config.find("gto") != std::string::npos - ? CONCRETE_SCHEDULER_GTO - : sched_config.find("rrr") != std::string::npos - ? CONCRETE_SCHEDULER_RRR - : sched_config.find("old") != std::string::npos - ? CONCRETE_SCHEDULER_OLDEST_FIRST - : sched_config.find("warp_limiting") != - std::string::npos - ? CONCRETE_SCHEDULER_WARP_LIMITING - : NUM_CONCRETE_SCHEDULERS; + sched_config.find("lrr") != std::string::npos ? CONCRETE_SCHEDULER_LRR + : sched_config.find("two_level_active") != std::string::npos + ? CONCRETE_SCHEDULER_TWO_LEVEL_ACTIVE + : sched_config.find("gto") != std::string::npos ? CONCRETE_SCHEDULER_GTO + : sched_config.find("rrr") != std::string::npos ? CONCRETE_SCHEDULER_RRR + : sched_config.find("old") != std::string::npos + ? CONCRETE_SCHEDULER_OLDEST_FIRST + : sched_config.find("warp_limiting") != std::string::npos + ? 
CONCRETE_SCHEDULER_WARP_LIMITING + : NUM_CONCRETE_SCHEDULERS; assert(scheduler != NUM_CONCRETE_SCHEDULERS); for (unsigned i = 0; i < m_config->gpgpu_num_sched_per_core; i++) { @@ -487,8 +484,8 @@ shader_core_ctx::shader_core_ctx(class gpgpu_sim *gpu, m_sid = shader_id; m_tpc = tpc_id; - if(get_gpu()->get_config().g_power_simulation_enabled){ - scaling_coeffs = get_gpu()->get_scaling_coeffs(); + if (get_gpu()->get_config().g_power_simulation_enabled) { + scaling_coeffs = get_gpu()->get_scaling_coeffs(); } m_last_inst_gpu_sim_cycle = 0; @@ -641,7 +638,8 @@ void shader_core_stats::print(FILE *fout) const { fprintf(fout, "gpgpu_n_param_mem_insn = %d\n", gpgpu_n_param_insn); fprintf(fout, "gpgpu_n_shmem_bkconflict = %d\n", gpgpu_n_shmem_bkconflict); - fprintf(fout, "gpgpu_n_l1cache_bkconflict = %d\n", gpgpu_n_l1cache_bkconflict); + fprintf(fout, "gpgpu_n_l1cache_bkconflict = %d\n", + gpgpu_n_l1cache_bkconflict); fprintf(fout, "gpgpu_n_intrawarp_mshr_merge = %d\n", gpgpu_n_intrawarp_mshr_merge); @@ -893,7 +891,9 @@ void shader_core_ctx::decode() { m_warp[m_inst_fetch_buffer.m_warp_id]->inc_inst_in_pipeline(); if (pI1) { m_stats->m_num_decoded_insn[m_sid]++; - if ((pI1->oprnd_type == INT_OP) || (pI1->oprnd_type == UN_OP)) { //these counters get added up in mcPat to compute scheduler power + if ((pI1->oprnd_type == INT_OP) || + (pI1->oprnd_type == UN_OP)) { // these counters get added up in mcPat + // to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; } else if (pI1->oprnd_type == FP_OP) { m_stats->m_num_FPdecoded_insn[m_sid]++; @@ -904,7 +904,9 @@ void shader_core_ctx::decode() { m_warp[m_inst_fetch_buffer.m_warp_id]->ibuffer_fill(1, pI2); m_warp[m_inst_fetch_buffer.m_warp_id]->inc_inst_in_pipeline(); m_stats->m_num_decoded_insn[m_sid]++; - if ((pI1->oprnd_type == INT_OP) || (pI1->oprnd_type == UN_OP)) { //these counters get added up in mcPat to compute scheduler power + if ((pI1->oprnd_type == INT_OP) || + (pI1->oprnd_type == UN_OP)) { // these counters 
get added up in + // mcPat to compute scheduler power m_stats->m_num_INTdecoded_insn[m_sid]++; } else if (pI2->oprnd_type == FP_OP) { m_stats->m_num_FPdecoded_insn[m_sid]++; @@ -950,7 +952,8 @@ void shader_core_ctx::fetch() { m_threadState[tid].m_active = false; unsigned cta_id = m_warp[warp_id]->get_cta_id(); if (m_thread[tid] == NULL) { - register_cta_thread_exit(cta_id, m_warp[warp_id]->get_kernel_info()); + register_cta_thread_exit(cta_id, + m_warp[warp_id]->get_kernel_info()); } else { register_cta_thread_exit(cta_id, &(m_thread[tid]->get_kernel())); @@ -987,11 +990,10 @@ void shader_core_ctx::fetch() { m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); std::list events; enum cache_request_status status; - if (m_config->perfect_inst_const_cache){ + if (m_config->perfect_inst_const_cache) { status = HIT; shader_cache_access_log(m_sid, INSTRUCTION, 0); - } - else + } else status = m_L1I->access( (new_addr_type)ppc, mf, m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle, events); @@ -1050,14 +1052,13 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set, if (next_inst->m_is_ldgsts) { if (m_warp[warp_id]->m_ldgdepbar_buf.size() == ldgdepbar_id + 1) { m_warp[warp_id]->m_ldgdepbar_buf[ldgdepbar_id].push_back(*next_inst); - } - else { + } else { assert(m_warp[warp_id]->m_ldgdepbar_buf.size() < ldgdepbar_id + 1); std::vector l; l.push_back(*next_inst); m_warp[warp_id]->m_ldgdepbar_buf.push_back(l); } - // If the mask of the instruction is all 0, then the address is also 0, + // If the mask of the instruction is all 0, then the address is also 0, // so that there's no need to check through the writeback if (next_inst->get_active_mask() == 0) { (m_warp[warp_id]->m_ldgdepbar_buf.back()).back().pc = -1; @@ -1071,7 +1072,7 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set, } else if (next_inst->op == MEMORY_BARRIER_OP) { m_warp[warp_id]->set_membar(); - } else if (next_inst->m_is_ldgdepbar) { // Add for LDGDEPBAR + } else if (next_inst->m_is_ldgdepbar) { 
// Add for LDGDEPBAR m_warp[warp_id]->m_ldgdepbar_id++; // If there are no added LDGSTS, insert an empty vector if (m_warp[warp_id]->m_ldgdepbar_buf.size() != ldgdepbar_id + 1) { @@ -1082,16 +1083,20 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set, } else if (next_inst->m_is_depbar) { // Add for DEPBAR // Set to true immediately when a DEPBAR instruction is met m_warp[warp_id]->m_waiting_ldgsts = true; - m_warp[warp_id]->m_depbar_group = next_inst->m_depbar_group_no; // set in trace_driven.cc + m_warp[warp_id]->m_depbar_group = + next_inst->m_depbar_group_no; // set in trace_driven.cc - // Record the last group that's possbily being monitored by this DEPBAR instr + // Record the last group that's possbily being monitored by this DEPBAR + // instr m_warp[warp_id]->m_depbar_start_id = m_warp[warp_id]->m_ldgdepbar_id - 1; - - // Record the last group that's actually being monitored by this DEPBAR instr - unsigned int end_group = m_warp[warp_id]->m_ldgdepbar_id - m_warp[warp_id]->m_depbar_group; - // Check for the case that the LDGSTSs monitored have finished when encountering the - // DEPBAR instruction + // Record the last group that's actually being monitored by this DEPBAR + // instr + unsigned int end_group = + m_warp[warp_id]->m_ldgdepbar_id - m_warp[warp_id]->m_depbar_group; + + // Check for the case that the LDGSTSs monitored have finished when + // encountering the DEPBAR instruction bool done_flag = true; for (int i = 0; i < end_group; i++) { for (int j = 0; j < m_warp[warp_id]->m_ldgdepbar_buf[i].size(); j++) { @@ -1101,7 +1106,7 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set, } } } - + UpdateDEPBAR: if (done_flag) { if (m_warp[warp_id]->m_waiting_ldgsts) { @@ -1184,11 +1189,12 @@ void scheduler_unit::order_rrr( if (m_num_issued_last_cycle > 0 || warp(m_current_turn_warp).done_exit() || warp(m_current_turn_warp).waiting()) { std::vector::const_iterator iter = - (last_issued_from_input == input_list.end()) ? 
- input_list.begin() : last_issued_from_input + 1; + (last_issued_from_input == input_list.end()) + ? input_list.begin() + : last_issued_from_input + 1; for (unsigned count = 0; count < num_warps_to_add; ++iter, ++count) { if (iter == input_list.end()) { - iter = input_list.begin(); + iter = input_list.begin(); } unsigned warp_id = (*iter)->get_warp_id(); if (!(*iter)->done_exit() && !(*iter)->waiting()) { @@ -1854,33 +1860,38 @@ void ldst_unit::get_L1T_sub_stats(struct cache_sub_stats &css) const { // Add this function to unset depbar void shader_core_ctx::unset_depbar(const warp_inst_t &inst) { bool done_flag = true; - unsigned int end_group = m_warp[inst.warp_id()]->m_depbar_start_id == 0 ? - m_warp[inst.warp_id()]->m_ldgdepbar_buf.size() : - (m_warp[inst.warp_id()]->m_depbar_start_id - m_warp[inst.warp_id()]->m_depbar_group + 1); + unsigned int end_group = m_warp[inst.warp_id()]->m_depbar_start_id == 0 + ? m_warp[inst.warp_id()]->m_ldgdepbar_buf.size() + : (m_warp[inst.warp_id()]->m_depbar_start_id - + m_warp[inst.warp_id()]->m_depbar_group + 1); - if (inst.m_is_ldgsts) { + if (inst.m_is_ldgsts) { for (int i = 0; i < m_warp[inst.warp_id()]->m_ldgdepbar_buf.size(); i++) { - for (int j = 0; j < m_warp[inst.warp_id()]->m_ldgdepbar_buf[i].size(); j++) { + for (int j = 0; j < m_warp[inst.warp_id()]->m_ldgdepbar_buf[i].size(); + j++) { if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc == inst.pc) { - // Handle the case that same pc results in multiple LDGSTS instructions - if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].get_addr(0) == inst.get_addr(0)) { + // Handle the case that same pc results in multiple LDGSTS + // instructions + if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].get_addr(0) == + inst.get_addr(0)) { m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc = -1; goto DoneWB; } - } + } } } DoneWB: for (int i = 0; i < end_group; i++) { - for (int j = 0; j < m_warp[inst.warp_id()]->m_ldgdepbar_buf[i].size(); j++) { + for (int j = 0; j < 
m_warp[inst.warp_id()]->m_ldgdepbar_buf[i].size(); + j++) { if (m_warp[inst.warp_id()]->m_ldgdepbar_buf[i][j].pc != -1) { done_flag = false; goto UpdateDEPBAR; } } } - + UpdateDEPBAR: if (done_flag) { if (m_warp[inst.warp_id()]->m_waiting_ldgsts) { @@ -2000,7 +2011,7 @@ mem_stage_stall_type ldst_unit::process_cache_access( if (inst.is_load()) { for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++) if (inst.out[r] > 0) m_pending_writes[inst.warp_id()][inst.out[r]]--; - + // release LDGSTS if (inst.m_is_ldgsts) { m_pending_ldgsts[inst.warp_id()][inst.pc][inst.get_addr(0)]--; @@ -2139,8 +2150,12 @@ void ldst_unit::L1_latency_queue_cycle() { // release LDGSTS if (mf_next->get_inst().m_is_ldgsts) { - m_pending_ldgsts[mf_next->get_inst().warp_id()][mf_next->get_inst().pc][mf_next->get_inst().get_addr(0)]--; - if (m_pending_ldgsts[mf_next->get_inst().warp_id()][mf_next->get_inst().pc][mf_next->get_inst().get_addr(0)] == 0) { + m_pending_ldgsts[mf_next->get_inst().warp_id()] + [mf_next->get_inst().pc] + [mf_next->get_inst().get_addr(0)]--; + if (m_pending_ldgsts[mf_next->get_inst().warp_id()] + [mf_next->get_inst().pc] + [mf_next->get_inst().get_addr(0)] == 0) { m_core->unset_depbar(mf_next->get_inst()); } } @@ -2207,7 +2222,8 @@ bool ldst_unit::constant_cycle(warp_inst_t &inst, mem_stage_stall_type &rc_fail, while (inst.accessq_count() > 0) inst.accessq_pop_back(); if (inst.is_load()) { for (unsigned r = 0; r < MAX_OUTPUT_VALUES; r++) - if (inst.out[r] > 0) m_pending_writes[inst.warp_id()][inst.out[r]] -= access_count; + if (inst.out[r] > 0) + m_pending_writes[inst.warp_id()][inst.out[r]] -= access_count; } } else { fail = process_memory_access_queue(m_L1C, inst); @@ -2395,7 +2411,7 @@ void sp_unit::active_lanes_in_pipeline() { void dp_unit::active_lanes_in_pipeline() { unsigned active_count = pipelined_simd_unit::get_active_lanes_in_pipeline(); assert(active_count <= m_core->get_config()->warp_size); - //m_core->incspactivelanes_stat(active_count); + // 
m_core->incspactivelanes_stat(active_count); m_core->incfuactivelanes_stat(active_count); m_core->incfumemactivelanes_stat(active_count); } @@ -2527,9 +2543,9 @@ void pipelined_simd_unit::cycle() { if (!m_dispatch_reg->dispatch_delay()) { int start_stage = m_dispatch_reg->latency - m_dispatch_reg->initiation_interval; - if(m_pipeline_reg[start_stage]->empty()) { - move_warp(m_pipeline_reg[start_stage], m_dispatch_reg); - active_insts_in_pipeline++; + if (m_pipeline_reg[start_stage]->empty()) { + move_warp(m_pipeline_reg[start_stage], m_dispatch_reg); + active_insts_in_pipeline++; } } } @@ -2682,10 +2698,12 @@ void ldst_unit::writeback() { m_next_wb.out[r]); insn_completed = true; } - } - else if (m_next_wb.m_is_ldgsts) { // for LDGSTS instructions where no output register is used - m_pending_ldgsts[m_next_wb.warp_id()][m_next_wb.pc][m_next_wb.get_addr(0)]--; - if (m_pending_ldgsts[m_next_wb.warp_id()][m_next_wb.pc][m_next_wb.get_addr(0)] == 0) { + } else if (m_next_wb.m_is_ldgsts) { // for LDGSTS instructions where no + // output register is used + m_pending_ldgsts[m_next_wb.warp_id()][m_next_wb.pc] + [m_next_wb.get_addr(0)]--; + if (m_pending_ldgsts[m_next_wb.warp_id()][m_next_wb.pc] + [m_next_wb.get_addr(0)] == 0) { insn_completed = true; } break; @@ -2923,7 +2941,8 @@ void ldst_unit::cycle() { // release LDGSTS if (m_dispatch_reg->m_is_ldgsts) { // m_pending_ldgsts[m_dispatch_reg->warp_id()][m_dispatch_reg->pc][m_dispatch_reg->get_addr(0)]--; - if (m_pending_ldgsts[m_dispatch_reg->warp_id()][m_dispatch_reg->pc][m_dispatch_reg->get_addr(0)] == 0) { + if (m_pending_ldgsts[m_dispatch_reg->warp_id()][m_dispatch_reg->pc] + [m_dispatch_reg->get_addr(0)] == 0) { m_core->unset_depbar(*m_dispatch_reg); } } @@ -3223,68 +3242,68 @@ void warp_inst_t::print(FILE *fout) const { m_config->gpgpu_ctx->func_sim->ptx_print_insn(pc, fout); fprintf(fout, "\n"); } -void shader_core_ctx::incexecstat(warp_inst_t *&inst) -{ - // Latency numbers for next operations are used to scale the 
power values - // for special operations, according observations from microbenchmarking - // TODO: put these numbers in the xml configuration - if(get_gpu()->get_config().g_power_simulation_enabled){ - switch(inst->sp_op){ - case INT__OP: - incialu_stat(inst->active_count(), scaling_coeffs->int_coeff); - break; - case INT_MUL_OP: - incimul_stat(inst->active_count(), scaling_coeffs->int_mul_coeff); - break; - case INT_MUL24_OP: - incimul24_stat(inst->active_count(), scaling_coeffs->int_mul24_coeff); - break; - case INT_MUL32_OP: - incimul32_stat(inst->active_count(), scaling_coeffs->int_mul32_coeff); - break; - case INT_DIV_OP: - incidiv_stat(inst->active_count(), scaling_coeffs->int_div_coeff); - break; - case FP__OP: - incfpalu_stat(inst->active_count(),scaling_coeffs->fp_coeff); - break; - case FP_MUL_OP: - incfpmul_stat(inst->active_count(), scaling_coeffs->fp_mul_coeff); - break; - case FP_DIV_OP: - incfpdiv_stat(inst->active_count(), scaling_coeffs->fp_div_coeff); - break; - case DP___OP: - incdpalu_stat(inst->active_count(), scaling_coeffs->dp_coeff); - break; - case DP_MUL_OP: - incdpmul_stat(inst->active_count(), scaling_coeffs->dp_mul_coeff); - break; - case DP_DIV_OP: - incdpdiv_stat(inst->active_count(), scaling_coeffs->dp_div_coeff); - break; - case FP_SQRT_OP: - incsqrt_stat(inst->active_count(), scaling_coeffs->sqrt_coeff); - break; - case FP_LG_OP: - inclog_stat(inst->active_count(), scaling_coeffs->log_coeff); - break; - case FP_SIN_OP: - incsin_stat(inst->active_count(), scaling_coeffs->sin_coeff); - break; - case FP_EXP_OP: - incexp_stat(inst->active_count(), scaling_coeffs->exp_coeff); - break; - case TENSOR__OP: - inctensor_stat(inst->active_count(), scaling_coeffs->tensor_coeff); - break; - case TEX__OP: - inctex_stat(inst->active_count(), scaling_coeffs->tex_coeff); - break; - default: - break; +void shader_core_ctx::incexecstat(warp_inst_t *&inst) { + // Latency numbers for next operations are used to scale the power values + // for special 
operations, according observations from microbenchmarking + // TODO: put these numbers in the xml configuration + if (get_gpu()->get_config().g_power_simulation_enabled) { + switch (inst->sp_op) { + case INT__OP: + incialu_stat(inst->active_count(), scaling_coeffs->int_coeff); + break; + case INT_MUL_OP: + incimul_stat(inst->active_count(), scaling_coeffs->int_mul_coeff); + break; + case INT_MUL24_OP: + incimul24_stat(inst->active_count(), scaling_coeffs->int_mul24_coeff); + break; + case INT_MUL32_OP: + incimul32_stat(inst->active_count(), scaling_coeffs->int_mul32_coeff); + break; + case INT_DIV_OP: + incidiv_stat(inst->active_count(), scaling_coeffs->int_div_coeff); + break; + case FP__OP: + incfpalu_stat(inst->active_count(), scaling_coeffs->fp_coeff); + break; + case FP_MUL_OP: + incfpmul_stat(inst->active_count(), scaling_coeffs->fp_mul_coeff); + break; + case FP_DIV_OP: + incfpdiv_stat(inst->active_count(), scaling_coeffs->fp_div_coeff); + break; + case DP___OP: + incdpalu_stat(inst->active_count(), scaling_coeffs->dp_coeff); + break; + case DP_MUL_OP: + incdpmul_stat(inst->active_count(), scaling_coeffs->dp_mul_coeff); + break; + case DP_DIV_OP: + incdpdiv_stat(inst->active_count(), scaling_coeffs->dp_div_coeff); + break; + case FP_SQRT_OP: + incsqrt_stat(inst->active_count(), scaling_coeffs->sqrt_coeff); + break; + case FP_LG_OP: + inclog_stat(inst->active_count(), scaling_coeffs->log_coeff); + break; + case FP_SIN_OP: + incsin_stat(inst->active_count(), scaling_coeffs->sin_coeff); + break; + case FP_EXP_OP: + incexp_stat(inst->active_count(), scaling_coeffs->exp_coeff); + break; + case TENSOR__OP: + inctensor_stat(inst->active_count(), scaling_coeffs->tensor_coeff); + break; + case TEX__OP: + inctex_stat(inst->active_count(), scaling_coeffs->tex_coeff); + break; + default: + break; } - if(inst->const_cache_operand) //warp has const address space load as one operand + if (inst->const_cache_operand) // warp has const address space load as one + // operand 
inc_const_accesses(1); } } @@ -4034,7 +4053,9 @@ void shader_core_ctx::get_icnt_power_stats(long &n_simt_to_mem, n_mem_to_simt += m_stats->n_mem_to_simt[m_sid]; } -kernel_info_t* shd_warp_t::get_kernel_info() const { return m_shader->get_kernel_info(); } +kernel_info_t *shd_warp_t::get_kernel_info() const { + return m_shader->get_kernel_info(); +} bool shd_warp_t::functional_done() const { return get_n_completed() == m_warp_size; @@ -4069,7 +4090,8 @@ bool shd_warp_t::waiting() { void shd_warp_t::print(FILE *fout) const { if (!done_exit()) { - fprintf(fout, "w%02u npc: 0x%04llx, done:%c%c%c%c:%2u i:%u s:%u a:%u (done: ", + fprintf(fout, + "w%02u npc: 0x%04llx, done:%c%c%c%c:%2u i:%u s:%u a:%u (done: ", m_warp_id, m_next_pc, (functional_done() ? 'f' : ' '), (stores_done() ? 's' : ' '), (inst_in_pipeline() ? ' ' : 'i'), (done_exit() ? 'e' : ' '), n_completed, m_inst_in_pipeline, @@ -4156,18 +4178,18 @@ void opndcoll_rfu_t::init(unsigned num_banks, shader_core_ctx *shader) { unsigned cusPerSched = m_cu.size() / m_num_warp_scheds; reg_id = j / cusPerSched; } - m_cu[j]->init(j, num_banks, shader->get_config(), this, - sub_core_model, reg_id, m_num_banks_per_sched); + m_cu[j]->init(j, num_banks, shader->get_config(), this, sub_core_model, + reg_id, m_num_banks_per_sched); } for (unsigned j = 0; j < m_dispatch_units.size(); j++) { - m_dispatch_units[j].init(sub_core_model,m_num_warp_scheds); + m_dispatch_units[j].init(sub_core_model, m_num_warp_scheds); } m_initialized = true; } unsigned register_bank(int regnum, int wid, unsigned num_banks, - bool sub_core_model, - unsigned banks_per_sched, unsigned sched_id) { + bool sub_core_model, unsigned banks_per_sched, + unsigned sched_id) { int bank = regnum; bank += wid; if (sub_core_model) { @@ -4186,14 +4208,13 @@ bool opndcoll_rfu_t::writeback(warp_inst_t &inst) { int reg_num = inst.arch_reg.dst[op]; // this math needs to match that used // in function_info::ptx_decode_inst if (reg_num >= 0) { // valid register - unsigned 
bank = register_bank(reg_num, inst.warp_id(), m_num_banks, - sub_core_model, - m_num_banks_per_sched, inst.get_schd_id()); + unsigned bank = + register_bank(reg_num, inst.warp_id(), m_num_banks, sub_core_model, + m_num_banks_per_sched, inst.get_schd_id()); if (m_arbiter.bank_idle(bank)) { m_arbiter.allocate_bank_for_write( - bank, - op_t(&inst, reg_num, m_num_banks, sub_core_model, - m_num_banks_per_sched, inst.get_schd_id())); + bank, op_t(&inst, reg_num, m_num_banks, sub_core_model, + m_num_banks_per_sched, inst.get_schd_id())); inst.arch_reg.dst[op] = -1; } else { return false; @@ -4301,9 +4322,8 @@ void opndcoll_rfu_t::allocate_reads() { const op_t &rr = *r; unsigned reg = rr.get_reg(); unsigned wid = rr.get_wid(); - unsigned bank = - register_bank(reg, wid, m_num_banks, sub_core_model, - m_num_banks_per_sched, rr.get_sid()); + unsigned bank = register_bank(reg, wid, m_num_banks, sub_core_model, + m_num_banks_per_sched, rr.get_sid()); m_arbiter.allocate_for_read(bank, rr); read_ops[bank] = rr; } @@ -4353,10 +4373,12 @@ void opndcoll_rfu_t::collector_unit_t::dump( } } -void opndcoll_rfu_t::collector_unit_t::init( - unsigned n, unsigned num_banks, - const core_config *config, opndcoll_rfu_t *rfu, bool sub_core_model, - unsigned reg_id, unsigned banks_per_sched) { +void opndcoll_rfu_t::collector_unit_t::init(unsigned n, unsigned num_banks, + const core_config *config, + opndcoll_rfu_t *rfu, + bool sub_core_model, + unsigned reg_id, + unsigned banks_per_sched) { m_rfu = rfu; m_cuid = n; m_num_banks = num_banks; @@ -4376,7 +4398,7 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, warp_inst_t **pipeline_reg = pipeline_reg_set->get_ready(); if ((pipeline_reg) and !((*pipeline_reg)->empty())) { m_warp_id = (*pipeline_reg)->warp_id(); - std::vector prev_regs; // remove duplicate regs within same instr + std::vector prev_regs; // remove duplicate regs within same instr for (unsigned op = 0; op < MAX_REG_OPERANDS; op++) { int reg_num = 
(*pipeline_reg) @@ -4384,14 +4406,13 @@ bool opndcoll_rfu_t::collector_unit_t::allocate(register_set *pipeline_reg_set, // function_info::ptx_decode_inst bool new_reg = true; for (auto r : prev_regs) { - if (r == reg_num) - new_reg = false; + if (r == reg_num) new_reg = false; } - if (reg_num >= 0 && new_reg) { // valid register + if (reg_num >= 0 && new_reg) { // valid register prev_regs.push_back(reg_num); - m_src_op[op] = op_t(this, op, reg_num, m_num_banks, - m_sub_core_model, m_num_banks_per_sched, - (*pipeline_reg)->get_schd_id()); + m_src_op[op] = + op_t(this, op, reg_num, m_num_banks, m_sub_core_model, + m_num_banks_per_sched, (*pipeline_reg)->get_schd_id()); m_not_ready.set(op); } else m_src_op[op] = op_t(); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 95e142e13..b1f904fd8 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -1,18 +1,19 @@ // Copyright (c) 2009-2021, Tor M. Aamodt, Wilson W.L. Fung, Andrew Turner, -// Ali Bakhoda, Vijay Kandiah, Nikos Hardavellas, +// Ali Bakhoda, Vijay Kandiah, Nikos Hardavellas, // Mahmoud Khairy, Junrui Pan, Timothy G. Rogers -// The University of British Columbia, Northwestern University, Purdue University -// All rights reserved. +// The University of British Columbia, Northwestern University, Purdue +// University All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // -// 1. Redistributions of source code must retain the above copyright notice, this +// 1. Redistributions of source code must retain the above copyright notice, +// this // list of conditions and the following disclaimer; // 2. Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution; -// 3. 
Neither the names of The University of British Columbia, Northwestern +// 3. Neither the names of The University of British Columbia, Northwestern // University nor the names of their contributors may be used to // endorse or promote products derived from this software without specific // prior written permission. @@ -199,7 +200,7 @@ class shd_warp_t { void clear_membar() { m_membar = false; } bool get_membar() const { return m_membar; } virtual address_type get_pc() const { return m_next_pc; } - virtual kernel_info_t* get_kernel_info() const; + virtual kernel_info_t *get_kernel_info() const; void set_next_pc(address_type pc) { m_next_pc = pc; } void store_info_of_last_inst_at_barrier(const warp_inst_t *pI) { @@ -318,12 +319,14 @@ class shd_warp_t { bool m_cdp_dummy; // Ni: LDGDEPBAR barrier support - public: - unsigned int m_ldgdepbar_id; // LDGDEPBAR barrier ID - std::vector> m_ldgdepbar_buf; // LDGDEPBAR barrier buffer - unsigned int m_depbar_start_id; - unsigned int m_depbar_group; - bool m_waiting_ldgsts; // Ni: Whether the warp is waiting for the LDGSTS instrs to finish + public: + unsigned int m_ldgdepbar_id; // LDGDEPBAR barrier ID + std::vector> + m_ldgdepbar_buf; // LDGDEPBAR barrier buffer + unsigned int m_depbar_start_id; + unsigned int m_depbar_group; + bool m_waiting_ldgsts; // Ni: Whether the warp is waiting for the LDGSTS + // instrs to finish }; inline unsigned hw_tid_from_wid(unsigned wid, unsigned warp_size, unsigned i) { @@ -337,8 +340,8 @@ const unsigned WARP_PER_CTA_MAX = 64; typedef std::bitset warp_set_t; unsigned register_bank(int regnum, int wid, unsigned num_banks, - bool sub_core_model, - unsigned banks_per_sched, unsigned sched_id); + bool sub_core_model, unsigned banks_per_sched, + unsigned sched_id); class shader_core_ctx; class shader_core_config; @@ -681,28 +684,26 @@ class opndcoll_rfu_t { // operand collector based register file unit public: op_t() { m_valid = false; } op_t(collector_unit_t *cu, unsigned op, unsigned reg, unsigned 
num_banks, - bool sub_core_model, - unsigned banks_per_sched, unsigned sched_id) { + bool sub_core_model, unsigned banks_per_sched, unsigned sched_id) { m_valid = true; m_warp = NULL; m_cu = cu; m_operand = op; m_register = reg; m_shced_id = sched_id; - m_bank = register_bank(reg, cu->get_warp_id(), num_banks, - sub_core_model, banks_per_sched, sched_id); + m_bank = register_bank(reg, cu->get_warp_id(), num_banks, sub_core_model, + banks_per_sched, sched_id); } op_t(const warp_inst_t *warp, unsigned reg, unsigned num_banks, - bool sub_core_model, - unsigned banks_per_sched, unsigned sched_id) { + bool sub_core_model, unsigned banks_per_sched, unsigned sched_id) { m_valid = true; m_warp = warp; m_register = reg; m_cu = NULL; m_operand = -1; m_shced_id = sched_id; - m_bank = register_bank(reg, warp->warp_id(), num_banks, - sub_core_model, banks_per_sched, sched_id); + m_bank = register_bank(reg, warp->warp_id(), num_banks, sub_core_model, + banks_per_sched, sched_id); } // accessors @@ -950,9 +951,8 @@ class opndcoll_rfu_t { // operand collector based register file unit unsigned get_reg_id() const { return m_reg_id; } // modifiers - void init(unsigned n, unsigned num_banks, - const core_config *config, opndcoll_rfu_t *rfu, - bool m_sub_core_model, unsigned reg_id, + void init(unsigned n, unsigned num_banks, const core_config *config, + opndcoll_rfu_t *rfu, bool m_sub_core_model, unsigned reg_id, unsigned num_banks_per_sched); bool allocate(register_set *pipeline_reg, register_set *output_reg); @@ -996,8 +996,8 @@ class opndcoll_rfu_t { // operand collector based register file unit // With sub-core enabled round robin starts with the next cu assigned to a // different sub-core than the one that dispatched last unsigned cusPerSched = m_num_collectors / m_num_warp_scheds; - unsigned rr_increment = m_sub_core_model ? - cusPerSched - (m_last_cu % cusPerSched) : 1; + unsigned rr_increment = + m_sub_core_model ? 
cusPerSched - (m_last_cu % cusPerSched) : 1; for (unsigned n = 0; n < m_num_collectors; n++) { unsigned c = (m_last_cu + n + rr_increment) % m_num_collectors; if ((*m_collector_units)[c].ready()) { @@ -1317,8 +1317,8 @@ class sp_unit : public pipelined_simd_unit { class specialized_unit : public pipelined_simd_unit { public: specialized_unit(register_set *result_port, const shader_core_config *config, - shader_core_ctx *core, int supported_op, - char *unit_name, unsigned latency, unsigned issue_reg_id); + shader_core_ctx *core, int supported_op, char *unit_name, + unsigned latency, unsigned issue_reg_id); virtual bool can_issue(const warp_inst_t &inst) const { if (inst.op != m_supported_op) { return false; @@ -1348,13 +1348,14 @@ class ldst_unit : public pipelined_simd_unit { unsigned sid, unsigned tpc); // Add a structure to record the LDGSTS instructions, - // similar to m_pending_writes, but since LDGSTS does not have a output register - // to write to, so a new structure needs to be added - /* A multi-level map: unsigned (warp_id) -> unsigned (pc) -> unsigned (addr) -> unsigned (count) + // similar to m_pending_writes, but since LDGSTS does not have a output + // register to write to, so a new structure needs to be added + /* A multi-level map: unsigned (warp_id) -> unsigned (pc) -> unsigned (addr) + * -> unsigned (count) */ std::map>> + std::map>> m_pending_ldgsts; // modifiers virtual void issue(register_set &inst); @@ -1766,8 +1767,8 @@ struct shader_core_stats_pod { unsigned *m_active_tensor_core_lanes; unsigned *m_active_fu_lanes; unsigned *m_active_fu_mem_lanes; - double *m_active_exu_threads; //For power model - double *m_active_exu_warps; //For power model + double *m_active_exu_threads; // For power model + double *m_active_exu_warps; // For power model unsigned *m_n_diverge; // number of divergence occurring in this shader unsigned gpgpu_n_load_insn; unsigned gpgpu_n_store_insn; @@ -1838,56 +1839,41 @@ class shader_core_stats : public 
shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_loadqueued_insn = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tex_inst = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_num_tex_inst = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_INTdecoded_insn = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_ialu_acesses = - (double *)calloc(config->num_shader(), sizeof(double)); - m_num_fp_acesses = - (double *)calloc(config->num_shader(), sizeof(double)); - m_num_imul_acesses = - (double *)calloc(config->num_shader(), sizeof(double)); + m_num_ialu_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_fp_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_imul_acesses = (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul24_acesses = (double *)calloc(config->num_shader(), sizeof(double)); m_num_imul32_acesses = (double *)calloc(config->num_shader(), sizeof(double)); m_num_fpmul_acesses = (double *)calloc(config->num_shader(), sizeof(double)); - m_num_idiv_acesses = - (double *)calloc(config->num_shader(), sizeof(double)); + m_num_idiv_acesses = (double *)calloc(config->num_shader(), sizeof(double)); m_num_fpdiv_acesses = (double *)calloc(config->num_shader(), sizeof(double)); - m_num_dp_acesses = - (double*) calloc(config->num_shader(),sizeof(double)); - m_num_dpmul_acesses = - (double*) calloc(config->num_shader(),sizeof(double)); - m_num_dpdiv_acesses = - (double*) calloc(config->num_shader(),sizeof(double)); - m_num_sp_acesses = + m_num_dp_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_dpmul_acesses = (double *)calloc(config->num_shader(), sizeof(double)); - m_num_sfu_acesses = + m_num_dpdiv_acesses = (double *)calloc(config->num_shader(), sizeof(double)); - m_num_tensor_core_acesses = + m_num_sp_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + 
m_num_sfu_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_tensor_core_acesses = (double *)calloc(config->num_shader(), sizeof(double)); m_num_const_acesses = (double *)calloc(config->num_shader(), sizeof(double)); - m_num_tex_acesses = - (double *)calloc(config->num_shader(), sizeof(double)); - m_num_sqrt_acesses = - (double*) calloc(config->num_shader(),sizeof(double)); - m_num_log_acesses = - (double*) calloc(config->num_shader(),sizeof(double)); - m_num_sin_acesses = - (double*) calloc(config->num_shader(),sizeof(double)); - m_num_exp_acesses = - (double*) calloc(config->num_shader(),sizeof(double)); - m_num_mem_acesses = - (double *)calloc(config->num_shader(), sizeof(double)); + m_num_tex_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_sqrt_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_log_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_sin_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_exp_acesses = (double *)calloc(config->num_shader(), sizeof(double)); + m_num_mem_acesses = (double *)calloc(config->num_shader(), sizeof(double)); m_num_sp_committed = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_num_tlb_hits = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_num_tlb_hits = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_num_tlb_accesses = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_active_sp_lanes = @@ -1900,8 +1886,7 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_active_exu_threads = (double *)calloc(config->num_shader(), sizeof(double)); - m_active_exu_warps = - (double *)calloc(config->num_shader(), sizeof(double)); + m_active_exu_warps = (double *)calloc(config->num_shader(), sizeof(double)); m_active_fu_mem_lanes = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); 
m_num_sfu_committed = @@ -1916,8 +1901,7 @@ class shader_core_stats : public shader_core_stats_pod { (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); m_non_rf_operands = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); - m_n_diverge = - (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); + m_n_diverge = (unsigned *)calloc(config->num_shader(), sizeof(unsigned)); shader_cycle_distro = (unsigned *)calloc(config->warp_size + 3, sizeof(unsigned)); last_shader_cycle_distro = @@ -2153,206 +2137,244 @@ class shader_core_ctx : public core_t { void incload_stat() { m_stats->m_num_loadqueued_insn[m_sid]++; } void incstore_stat() { m_stats->m_num_storequeued_insn[m_sid]++; } - void incialu_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_ialu_acesses[m_sid]=m_stats->m_num_ialu_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_nonsfu(active_count, latency); - }else { - m_stats->m_num_ialu_acesses[m_sid]=m_stats->m_num_ialu_acesses[m_sid]+(double)active_count*latency; - } - m_stats->m_active_exu_threads[m_sid]+=active_count; + void incialu_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_ialu_acesses[m_sid] = + m_stats->m_num_ialu_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + } else { + m_stats->m_num_ialu_acesses[m_sid] = + m_stats->m_num_ialu_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; } - void incimul_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_imul_acesses[m_sid]=m_stats->m_num_imul_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_nonsfu(active_count, latency); - }else { - 
m_stats->m_num_imul_acesses[m_sid]=m_stats->m_num_imul_acesses[m_sid]+(double)active_count*latency; + void incimul_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_imul_acesses[m_sid] = + m_stats->m_num_imul_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + } else { + m_stats->m_num_imul_acesses[m_sid] = + m_stats->m_num_imul_acesses[m_sid] + (double)active_count * latency; } - m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; } - void incimul24_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_imul24_acesses[m_sid]=m_stats->m_num_imul24_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_nonsfu(active_count, latency); - }else { - m_stats->m_num_imul24_acesses[m_sid]=m_stats->m_num_imul24_acesses[m_sid]+(double)active_count*latency; - } - m_stats->m_active_exu_threads[m_sid]+=active_count; - m_stats->m_active_exu_warps[m_sid]++; - } - void incimul32_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_imul32_acesses[m_sid]=m_stats->m_num_imul32_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_sfu(active_count, latency); - }else{ - m_stats->m_num_imul32_acesses[m_sid]=m_stats->m_num_imul32_acesses[m_sid]+(double)active_count*latency; - } - m_stats->m_active_exu_threads[m_sid]+=active_count; + void incimul24_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_imul24_acesses[m_sid] = + m_stats->m_num_imul24_acesses[m_sid] + + (double)active_count * latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + } else { + m_stats->m_num_imul24_acesses[m_sid] = + m_stats->m_num_imul24_acesses[m_sid] + (double)active_count * 
latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; } - void incidiv_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_idiv_acesses[m_sid]=m_stats->m_num_idiv_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_sfu(active_count, latency); - }else { - m_stats->m_num_idiv_acesses[m_sid]=m_stats->m_num_idiv_acesses[m_sid]+(double)active_count*latency; - } - m_stats->m_active_exu_threads[m_sid]+=active_count; - m_stats->m_active_exu_warps[m_sid]++; - } - void incfpalu_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_fp_acesses[m_sid]=m_stats->m_num_fp_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_nonsfu(active_count, latency); - }else { - m_stats->m_num_fp_acesses[m_sid]=m_stats->m_num_fp_acesses[m_sid]+(double)active_count*latency; - } - m_stats->m_active_exu_threads[m_sid]+=active_count; - m_stats->m_active_exu_warps[m_sid]++; - } - void incfpmul_stat(unsigned active_count,double latency) { - // printf("FP MUL stat increament\n"); - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_fpmul_acesses[m_sid]=m_stats->m_num_fpmul_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_nonsfu(active_count, latency); - }else { - m_stats->m_num_fpmul_acesses[m_sid]=m_stats->m_num_fpmul_acesses[m_sid]+(double)active_count*latency; - } - m_stats->m_active_exu_threads[m_sid]+=active_count; + void incimul32_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_imul32_acesses[m_sid] = + m_stats->m_num_imul32_acesses[m_sid] + + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_imul32_acesses[m_sid] = + m_stats->m_num_imul32_acesses[m_sid] + (double)active_count * latency; + } + 
m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; - } - void incfpdiv_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_fpdiv_acesses[m_sid]=m_stats->m_num_fpdiv_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_sfu(active_count, latency); - }else { - m_stats->m_num_fpdiv_acesses[m_sid]=m_stats->m_num_fpdiv_acesses[m_sid]+(double)active_count*latency; - } - m_stats->m_active_exu_threads[m_sid]+=active_count; + } + void incidiv_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_idiv_acesses[m_sid] = + m_stats->m_num_idiv_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_idiv_acesses[m_sid] = + m_stats->m_num_idiv_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; - } - void incdpalu_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_dp_acesses[m_sid]=m_stats->m_num_dp_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_nonsfu(active_count, latency); - }else { - m_stats->m_num_dp_acesses[m_sid]=m_stats->m_num_dp_acesses[m_sid]+(double)active_count*latency; - } - m_stats->m_active_exu_threads[m_sid]+=active_count; - m_stats->m_active_exu_warps[m_sid]++; - } - void incdpmul_stat(unsigned active_count,double latency) { - // printf("FP MUL stat increament\n"); - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_dpmul_acesses[m_sid]=m_stats->m_num_dpmul_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_nonsfu(active_count, latency); - }else { - m_stats->m_num_dpmul_acesses[m_sid]=m_stats->m_num_dpmul_acesses[m_sid]+(double)active_count*latency; - } - m_stats->m_active_exu_threads[m_sid]+=active_count; 
+ } + void incfpalu_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_fp_acesses[m_sid] = + m_stats->m_num_fp_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + } else { + m_stats->m_num_fp_acesses[m_sid] = + m_stats->m_num_fp_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; - } - void incdpdiv_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_dpdiv_acesses[m_sid]=m_stats->m_num_dpdiv_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_sfu(active_count, latency); - }else { - m_stats->m_num_dpdiv_acesses[m_sid]=m_stats->m_num_dpdiv_acesses[m_sid]+(double)active_count*latency; - } - m_stats->m_active_exu_threads[m_sid]+=active_count; + } + void incfpmul_stat(unsigned active_count, double latency) { + // printf("FP MUL stat increament\n"); + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_fpmul_acesses[m_sid] = + m_stats->m_num_fpmul_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + } else { + m_stats->m_num_fpmul_acesses[m_sid] = + m_stats->m_num_fpmul_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; - } - - void incsqrt_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_sqrt_acesses[m_sid]=m_stats->m_num_sqrt_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_sfu(active_count, latency); - }else{ - m_stats->m_num_sqrt_acesses[m_sid]=m_stats->m_num_sqrt_acesses[m_sid]+(double)active_count*latency; + } + void incfpdiv_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + 
m_stats->m_num_fpdiv_acesses[m_sid] = + m_stats->m_num_fpdiv_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_fpdiv_acesses[m_sid] = + m_stats->m_num_fpdiv_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpalu_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_dp_acesses[m_sid] = + m_stats->m_num_dp_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + } else { + m_stats->m_num_dp_acesses[m_sid] = + m_stats->m_num_dp_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpmul_stat(unsigned active_count, double latency) { + // printf("FP MUL stat increament\n"); + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_dpmul_acesses[m_sid] = + m_stats->m_num_dpmul_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_nonsfu(active_count, latency); + } else { + m_stats->m_num_dpmul_acesses[m_sid] = + m_stats->m_num_dpmul_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } + void incdpdiv_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_dpdiv_acesses[m_sid] = + m_stats->m_num_dpdiv_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_dpdiv_acesses[m_sid] = + m_stats->m_num_dpdiv_acesses[m_sid] + (double)active_count * latency; } - m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; - 
} + } - void inclog_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_log_acesses[m_sid]=m_stats->m_num_log_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_sfu(active_count, latency); - }else{ - m_stats->m_num_log_acesses[m_sid]=m_stats->m_num_log_acesses[m_sid]+(double)active_count*latency; + void incsqrt_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_sqrt_acesses[m_sid] = + m_stats->m_num_sqrt_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_sqrt_acesses[m_sid] = + m_stats->m_num_sqrt_acesses[m_sid] + (double)active_count * latency; } - m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; - } + } - void incexp_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_exp_acesses[m_sid]=m_stats->m_num_exp_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_sfu(active_count, latency); - }else{ - m_stats->m_num_exp_acesses[m_sid]=m_stats->m_num_exp_acesses[m_sid]+(double)active_count*latency; + void inclog_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_log_acesses[m_sid] = + m_stats->m_num_log_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_log_acesses[m_sid] = + m_stats->m_num_log_acesses[m_sid] + (double)active_count * latency; } - m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; } - void incsin_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - 
m_stats->m_num_sin_acesses[m_sid]=m_stats->m_num_sin_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_sfu(active_count, latency); - }else{ - m_stats->m_num_sin_acesses[m_sid]=m_stats->m_num_sin_acesses[m_sid]+(double)active_count*latency; + void incexp_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_exp_acesses[m_sid] = + m_stats->m_num_exp_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_exp_acesses[m_sid] = + m_stats->m_num_exp_acesses[m_sid] + (double)active_count * latency; } - m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; } + void incsin_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_sin_acesses[m_sid] = + m_stats->m_num_sin_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_sin_acesses[m_sid] = + m_stats->m_num_sin_acesses[m_sid] + (double)active_count * latency; + } + m_stats->m_active_exu_threads[m_sid] += active_count; + m_stats->m_active_exu_warps[m_sid]++; + } - void inctensor_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_tensor_core_acesses[m_sid]=m_stats->m_num_tensor_core_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_sfu(active_count, latency); - }else{ - m_stats->m_num_tensor_core_acesses[m_sid]=m_stats->m_num_tensor_core_acesses[m_sid]+(double)active_count*latency; + void inctensor_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_tensor_core_acesses[m_sid] = + m_stats->m_num_tensor_core_acesses[m_sid] + + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, 
latency); + } else { + m_stats->m_num_tensor_core_acesses[m_sid] = + m_stats->m_num_tensor_core_acesses[m_sid] + + (double)active_count * latency; } - m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; } - void inctex_stat(unsigned active_count,double latency) { - if(m_config->gpgpu_clock_gated_lanes==false){ - m_stats->m_num_tex_acesses[m_sid]=m_stats->m_num_tex_acesses[m_sid]+(double)active_count*latency - + inactive_lanes_accesses_sfu(active_count, latency); - }else{ - m_stats->m_num_tex_acesses[m_sid]=m_stats->m_num_tex_acesses[m_sid]+(double)active_count*latency; + void inctex_stat(unsigned active_count, double latency) { + if (m_config->gpgpu_clock_gated_lanes == false) { + m_stats->m_num_tex_acesses[m_sid] = + m_stats->m_num_tex_acesses[m_sid] + (double)active_count * latency + + inactive_lanes_accesses_sfu(active_count, latency); + } else { + m_stats->m_num_tex_acesses[m_sid] = + m_stats->m_num_tex_acesses[m_sid] + (double)active_count * latency; } - m_stats->m_active_exu_threads[m_sid]+=active_count; + m_stats->m_active_exu_threads[m_sid] += active_count; m_stats->m_active_exu_warps[m_sid]++; } void inc_const_accesses(unsigned active_count) { - m_stats->m_num_const_acesses[m_sid]=m_stats->m_num_const_acesses[m_sid]+active_count; + m_stats->m_num_const_acesses[m_sid] = + m_stats->m_num_const_acesses[m_sid] + active_count; } void incsfu_stat(unsigned active_count, double latency) { m_stats->m_num_sfu_acesses[m_sid] = - m_stats->m_num_sfu_acesses[m_sid] + (double)active_count*latency; + m_stats->m_num_sfu_acesses[m_sid] + (double)active_count * latency; } void incsp_stat(unsigned active_count, double latency) { m_stats->m_num_sp_acesses[m_sid] = - m_stats->m_num_sp_acesses[m_sid] + (double)active_count*latency; + m_stats->m_num_sp_acesses[m_sid] + (double)active_count * latency; } void incmem_stat(unsigned active_count, double latency) { if 
(m_config->gpgpu_clock_gated_lanes == false) { m_stats->m_num_mem_acesses[m_sid] = - m_stats->m_num_mem_acesses[m_sid] + (double)active_count*latency + + m_stats->m_num_mem_acesses[m_sid] + (double)active_count * latency + inactive_lanes_accesses_nonsfu(active_count, latency); } else { m_stats->m_num_mem_acesses[m_sid] = - m_stats->m_num_mem_acesses[m_sid] + (double)active_count*latency; + m_stats->m_num_mem_acesses[m_sid] + (double)active_count * latency; } } void incexecstat(warp_inst_t *&inst); From 081da0abacbc3595b090094b1a66fc37d35bd82a Mon Sep 17 00:00:00 2001 From: Cesar Avalos Date: Fri, 9 Aug 2024 18:42:11 -0400 Subject: [PATCH 146/154] Add support for SHF ptx instruction (#70) --- src/cuda-sim/instructions.cc | 32 ++++++++++++++++++++++++++++++++ src/cuda-sim/opcodes.def | 1 + src/cuda-sim/ptx.l | 8 +++++++- src/cuda-sim/ptx.y | 8 ++++++++ src/cuda-sim/ptx_ir.cc | 14 ++++++++++++++ src/cuda-sim/ptx_ir.h | 4 ++++ 6 files changed, 66 insertions(+), 1 deletion(-) diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 4792efc80..2314bef30 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -5441,6 +5441,38 @@ void shfl_impl(const ptx_instruction *pI, core_t *core, warp_inst_t inst) { } } +void shf_impl(const ptx_instruction *pI, ptx_thread_info *thread) { + ptx_reg_t a,b,c,d; + const operand_info &dst = pI->dst(); + const operand_info &src1 = pI->src1(); + const operand_info &src2 = pI->src2(); + const operand_info &src3 = pI->src3(); + + // Only b32 is allowed + unsigned i_type = pI->get_type(); + a = thread->get_operand_value(src1, dst, i_type, thread, 1); + b = thread->get_operand_value(src2, dst, i_type, thread, 1); + c = thread->get_operand_value(src3, dst, i_type, thread, 1); + + if(i_type != B32_TYPE) + printf("Only the b32 data_type is allowed per the ISA\n"); + + unsigned clamp_mode = pI->clamp_mode(); + unsigned n = c.u32 & 0x1f; + if(clamp_mode) { + if(c.u32 < 32) + n = c; + else + n = 32; + } + 
if(pI->left_mode()) + d.u32 = (b.u32 << n) | (a.u32 >> (32-n)); + else + d.u32 = (b.u32 << (32-n)) | (a.u32 >> n); + + thread->set_operand_value(dst, d, i_type, thread, pI); +} + void shl_impl(const ptx_instruction *pI, ptx_thread_info *thread) { ptx_reg_t a, b, d; const operand_info &dst = pI->dst(); diff --git a/src/cuda-sim/opcodes.def b/src/cuda-sim/opcodes.def index f5bf156e2..83a23ea77 100644 --- a/src/cuda-sim/opcodes.def +++ b/src/cuda-sim/opcodes.def @@ -103,6 +103,7 @@ OP_DEF(SELP_OP,selp_impl,"selp",1,1) OP_DEF(SETP_OP,setp_impl,"setp",1,1) OP_DEF(SET_OP,set_impl,"set",1,1) OP_W_DEF(SHFL_OP,shfl_impl,"shfl",1,10) +OP_DEF(SHF_OP,shf_impl,"shf",1,1) OP_DEF(SHL_OP,shl_impl,"shl",1,1) OP_DEF(SHR_OP,shr_impl,"shr",1,1) OP_DEF(SIN_OP,sin_impl,"sin",1,4) diff --git a/src/cuda-sim/ptx.l b/src/cuda-sim/ptx.l index 15b3cf77e..0810ef6e2 100644 --- a/src/cuda-sim/ptx.l +++ b/src/cuda-sim/ptx.l @@ -134,6 +134,7 @@ selp TC; yylval->int_value = SELP_OP; return OPCODE; setp TC; yylval->int_value = SETP_OP; return OPCODE; set TC; yylval->int_value = SET_OP; return OPCODE; shfl TC; yylval->int_value = SHFL_OP; return OPCODE; +shf TC; yylval->int_value = SHF_OP; return OPCODE; shl TC; yylval->int_value = SHL_OP; return OPCODE; shr TC; yylval->int_value = SHR_OP; return OPCODE; sin TC; yylval->int_value = SIN_OP; return OPCODE; @@ -317,6 +318,9 @@ breakaddr TC; yylval->int_value = BREAKADDR_OP; return OPCODE; \.sat TC; return SAT_OPTION; +\.l TC; return LEFT_OPTION; +\.r TC; return RIGHT_OPTION; + \.eq TC; return EQ_OPTION; \.ne TC; return NE_OPTION; \.lt TC; return LT_OPTION; @@ -354,6 +358,8 @@ breakaddr TC; yylval->int_value = BREAKADDR_OP; return OPCODE; \.arrive TC; return ARRIVE_OPTION; \.red TC; return RED_OPTION; +\.clamp TC; return CLAMP_OPTION; +\.wrap TC; return WRAP_OPTION; \.approx TC; return APPROX_OPTION; \.full TC; return FULL_OPTION; @@ -488,4 +494,4 @@ int ptx_error( yyscan_t yyscanner, ptx_recognizer* recognizer, const char *s ) fflush(stdout); //exit(1); 
return 0; -} \ No newline at end of file +} diff --git a/src/cuda-sim/ptx.y b/src/cuda-sim/ptx.y index b38f78352..61183e88c 100644 --- a/src/cuda-sim/ptx.y +++ b/src/cuda-sim/ptx.y @@ -220,6 +220,10 @@ class ptx_recognizer; %token PRMT_RC16_MODE; %token PRMT_ECL_MODE; %token PRMT_ECR_MODE; +%token WRAP_OPTION; +%token CLAMP_OPTION; +%token LEFT_OPTION; +%token RIGHT_OPTION; %type function_decl_header %type function_decl @@ -507,6 +511,10 @@ option: type_spec | DOWN_OPTION { recognizer->add_option(DOWN_OPTION); } | BFLY_OPTION { recognizer->add_option(BFLY_OPTION); } | IDX_OPTION { recognizer->add_option(IDX_OPTION); } + | WRAP_OPTION { recognizer->add_option(WRAP_OPTION); } + | CLAMP_OPTION { recognizer->add_option(CLAMP_OPTION); } + | LEFT_OPTION { recognizer->add_option(LEFT_OPTION); } + | RIGHT_OPTION { recognizer->add_option(RIGHT_OPTION); } ; atomic_operation_spec: ATOMIC_AND { recognizer->add_option(ATOMIC_AND); } diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index d3095428f..139920930 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -1227,6 +1227,8 @@ ptx_instruction::ptx_instruction( m_rounding_mode = RN_OPTION; m_compare_op = -1; m_saturation_mode = 0; + m_clamp_mode = 0; + m_left_mode = 0; m_geom_spec = 0; m_vector_spec = 0; m_atomic_spec = 0; @@ -1293,6 +1295,18 @@ ptx_instruction::ptx_instruction( case SAT_OPTION: m_saturation_mode = 1; break; + case WRAP_OPTION: + m_clamp_mode = 0; + break; + case CLAMP_OPTION: + m_clamp_mode = 1; + break; + case LEFT_OPTION: + m_left_mode = 1; + break; + case RIGHT_OPTION: + m_left_mode = 0; + break; case RNI_OPTION: case RZI_OPTION: case RMI_OPTION: diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index 8b1f19c86..46f183ba6 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -1085,6 +1085,8 @@ class ptx_instruction : public warp_inst_t { unsigned cache_option() const { return m_cache_option; } unsigned rounding_mode() const { return m_rounding_mode; } unsigned 
saturation_mode() const { return m_saturation_mode; } + unsigned clamp_mode() const {return m_clamp_mode;} + unsigned left_mode() const { return m_left_mode; } unsigned dimension() const { return m_geom_spec; } unsigned barrier_op() const { return m_barrier_op; } unsigned shfl_op() const { return m_shfl_op; } @@ -1159,6 +1161,8 @@ class ptx_instruction : public warp_inst_t { unsigned m_rounding_mode; unsigned m_compare_op; unsigned m_saturation_mode; + unsigned m_clamp_mode; + unsigned m_left_mode; unsigned m_barrier_op; unsigned m_shfl_op; unsigned m_prmt_op; From 42a0cde4b463794d041b544309afb69c315f78bc Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:16:06 -0400 Subject: [PATCH 147/154] remove unused & uninitialized variables & format automation (#69) * run formatter only on PR * remove unused & uninitialized variables * fix signed & unsigned comparison warning * enable merge queue * resolve conflict * in formatter, checkout the forked repo, not the base repo in PR * Try to use jenkins for formatter * Automated Format --------- Co-authored-by: purdue-jenkins --- .github/workflows/main.yml | 29 ++++++++++++----------------- src/cuda-sim/instructions.cc | 16 ++++++++-------- src/cuda-sim/ptx_ir.h | 2 +- src/gpgpu-sim/gpu-sim.h | 6 +++--- src/gpgpu-sim/shader.h | 4 ++-- 5 files changed, 26 insertions(+), 31 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 39f65c94c..8e0ae2324 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -4,7 +4,6 @@ name: Short-Tests # Controls when the workflow will run on: - # Triggers the workflow on push or pull request events but only for the mydev branch push: branches-ignore: - "gh-readonly-queue**" @@ -86,25 +85,21 @@ jobs: - name: Run Simulation run: /bin/bash $GITHUB_WORKSPACE/short-tests.sh format-code: - runs-on: ubuntu-latest + if: github.event_name == 'pull_request' + runs-on: tgrogers-raid needs: [build-TITANV,
build-TITANV-LOCALXBAR, build-QV100, build-2060, build-3070] - permissions: - # Give the default GITHUB_TOKEN write permission to commit and push the - # added or changed files to the repository. - contents: write steps: - uses: actions/checkout@v4 - # Other steps that change files in the repository go here - # … + with: + ref: ${{github.event.pull_request.head.ref}} + repository: ${{github.event.pull_request.head.repo.full_name}} + ssh-key: '' + - name: Run clang-format run: | - sudo apt-get install -y clang-format + git config user.name "purdue-jenkins" + git config user.email "purdue-jenkins@users.noreply.github.com" + git remote set-url origin git@github.com:${{github.event.pull_request.head.repo.full_name}} + git remote -v /bin/bash ./format-code.sh - - uses: stefanzweifel/git-auto-commit-action@v5 - with: - # Optional. Commit message for the created commit. - # Defaults to "Apply automatic changes" - commit_message: Automated clang-format - # Optional. Option used by `git-status` to determine if the repository is - # dirty. 
See https://git-scm.com/docs/git-status#_options - status_options: '--untracked-files=no' \ No newline at end of file + if git status --untracked-files=no | grep -q "nothing to commit"; then echo "No changes to commit."; else git commit -a -m "Automated Format"; git push; fi diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 2314bef30..108de9759 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -5442,33 +5442,33 @@ void shfl_impl(const ptx_instruction *pI, core_t *core, warp_inst_t inst) { } void shf_impl(const ptx_instruction *pI, ptx_thread_info *thread) { - ptx_reg_t a,b,c,d; + ptx_reg_t a, b, c, d; const operand_info &dst = pI->dst(); const operand_info &src1 = pI->src1(); const operand_info &src2 = pI->src2(); const operand_info &src3 = pI->src3(); - // Only b32 is allowed + // Only b32 is allowed unsigned i_type = pI->get_type(); a = thread->get_operand_value(src1, dst, i_type, thread, 1); b = thread->get_operand_value(src2, dst, i_type, thread, 1); c = thread->get_operand_value(src3, dst, i_type, thread, 1); - if(i_type != B32_TYPE) + if (i_type != B32_TYPE) printf("Only the b32 data_type is allowed per the ISA\n"); unsigned clamp_mode = pI->clamp_mode(); unsigned n = c.u32 & 0x1f; - if(clamp_mode) { - if(c.u32 < 32) + if (clamp_mode) { + if (c.u32 < 32) n = c; else n = 32; } - if(pI->left_mode()) - d.u32 = (b.u32 << n) | (a.u32 >> (32-n)); + if (pI->left_mode()) + d.u32 = (b.u32 << n) | (a.u32 >> (32 - n)); else - d.u32 = (b.u32 << (32-n)) | (a.u32 >> n); + d.u32 = (b.u32 << (32 - n)) | (a.u32 >> n); thread->set_operand_value(dst, d, i_type, thread, pI); } diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index 46f183ba6..d253866db 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -1085,7 +1085,7 @@ class ptx_instruction : public warp_inst_t { unsigned cache_option() const { return m_cache_option; } unsigned rounding_mode() const { return m_rounding_mode; } unsigned 
saturation_mode() const { return m_saturation_mode; } - unsigned clamp_mode() const {return m_clamp_mode;} + unsigned clamp_mode() const { return m_clamp_mode; } unsigned left_mode() const { return m_left_mode; } unsigned dimension() const { return m_geom_spec; } unsigned barrier_op() const { return m_barrier_op; } diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index a24ffd30e..d43b3995a 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -132,9 +132,9 @@ struct power_config { // NOTE: After changing the nonlinear model to only scaling idle core, // NOTE: The min_inc_per_active_sm is not used any more - if (g_use_nonlinear_model) - sscanf(gpu_nonlinear_model_config, "%lf:%lf", &gpu_idle_core_power, - &gpu_min_inc_per_active_sm); + // if (g_use_nonlinear_model) + // sscanf(gpu_nonlinear_model_config, "%lf:%lf", &gpu_idle_core_power, + // &gpu_min_inc_per_active_sm); } void reg_options(class OptionParser *opp); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index b1f904fd8..92691d386 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -134,7 +134,7 @@ class shd_warp_t { m_waiting_ldgsts = false; // Ni: Clear m_ldgdepbar_buf - for (int i = 0; i < m_ldgdepbar_buf.size(); i++) { + for (unsigned i = 0; i < m_ldgdepbar_buf.size(); i++) { m_ldgdepbar_buf[i].clear(); } m_ldgdepbar_buf.clear(); @@ -165,7 +165,7 @@ class shd_warp_t { m_waiting_ldgsts = false; // Ni: Clear m_ldgdepbar_buf - for (int i = 0; i < m_ldgdepbar_buf.size(); i++) { + for (unsigned i = 0; i < m_ldgdepbar_buf.size(); i++) { m_ldgdepbar_buf[i].clear(); } m_ldgdepbar_buf.clear(); From 38b4df5653ecbd9907a3d39b125640cd4fb7d012 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Tue, 20 Aug 2024 20:43:32 -0400 Subject: [PATCH 148/154] Stream stats (#71) * Temp commit for Justin and Cassie to sync on code changes for adding per-stream status. * Resolved compile errors. 
* Removed redundant parameter * Passed cuda_stream_id from accelsim to gpgpusim * Cleaned up unused changes * Changed vector to map, having operator problems. * StreamID defaults to zero * Implemented streams to inc_stats and so on * Fixed TOTAL_ACCESS counts * Implemented GLOBAL_TIMER. * Fixed m_shader->get_kernel SEGFAULT issue in shader.cc. * Use warp_init to track streamID instead of issue_warp * Removed temp debug print * Modified cache_stats to only print data from latest finished stream Added optional arg to cache_stats::print_stats, cache_stats::print_fail_stats and their upstream functions. When streamID is specified, print stats from that stream. When not specified, print all stats. NOTE: current implementation depending on streamid never equals -1 * Removed default arg values of streamID * modified constructor of mem_fetch to pass in streamID * changed get_streamid to get_streamID * Added TODO to gpgpusim_entrypoint.cc and power_stat.cc * Only collect power stats when enabled * print last finished stream in PTX mode using last_streamID * take out additional printf * Add a field to baseline cache to indicate cache level * save gpu object in cache * Print stream ID only once per kernel * rm test print * use -1 for default stream id * cleanup debug prints * remove GLOABL_TIMER * Automated clang-format * Should be correct to print everything in power model * addressing concerns & errors * Automated clang-format * add m_stats_pw in operator+ * Automated Format --------- Co-authored-by: Justin Qiao Co-authored-by: Justin Qiao <71228724+ShichenQiao@users.noreply.github.com> Co-authored-by: Tim Rogers Co-authored-by: JRPan Co-authored-by: purdue-jenkins --- libcuda/cuda_runtime_api.cc | 4 +- src/abstract_hardware_model.cc | 8 +- src/abstract_hardware_model.h | 21 +- src/gpgpu-sim/gpu-cache.cc | 533 +++++++++++++++++++++++---------- src/gpgpu-sim/gpu-cache.h | 77 +++-- src/gpgpu-sim/gpu-sim.cc | 72 +++-- src/gpgpu-sim/gpu-sim.h | 18 +- src/gpgpu-sim/l2cache.cc | 
29 +- src/gpgpu-sim/l2cache.h | 7 +- src/gpgpu-sim/mem_fetch.cc | 9 +- src/gpgpu-sim/mem_fetch.h | 10 +- src/gpgpu-sim/power_stat.cc | 6 +- src/gpgpu-sim/shader.cc | 42 +-- src/gpgpu-sim/shader.h | 19 +- src/gpgpusim_entrypoint.cc | 6 +- src/stream_manager.cc | 12 +- 16 files changed, 613 insertions(+), 260 deletions(-) diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc index b64c3d9e2..b540ffd91 100644 --- a/libcuda/cuda_runtime_api.cc +++ b/libcuda/cuda_runtime_api.cc @@ -1421,7 +1421,9 @@ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlagsInternal( dim3 gridDim(context->get_device()->get_gpgpu()->max_cta_per_core() * context->get_device()->get_gpgpu()->get_config().num_shader()); dim3 blockDim(blockSize); - kernel_info_t result(gridDim, blockDim, entry); + // because this function is only checking for resource requirements, we do + // not care which stream this kernel runs at, just picked -1 + kernel_info_t result(gridDim, blockDim, entry, -1); // if(entry == NULL){ // *numBlocks = 1; // return g_last_cudaError = cudaErrorUnknown; diff --git a/src/abstract_hardware_model.cc b/src/abstract_hardware_model.cc index fd056c6d1..e8ddf95ab 100644 --- a/src/abstract_hardware_model.cc +++ b/src/abstract_hardware_model.cc @@ -49,12 +49,14 @@ void mem_access_t::init(gpgpu_context *ctx) { m_addr = 0; m_req_size = 0; } + void warp_inst_t::issue(const active_mask_t &mask, unsigned warp_id, unsigned long long cycle, int dynamic_warp_id, - int sch_id) { + int sch_id, unsigned long long streamID) { m_warp_active_mask = mask; m_warp_issued_mask = mask; m_uid = ++(m_config->gpgpu_ctx->warp_inst_sm_next_uid); + m_streamID = streamID; m_warp_id = warp_id; m_dynamic_warp_id = dynamic_warp_id; issue_cycle = cycle; @@ -755,7 +757,8 @@ void warp_inst_t::completed(unsigned long long cycle) const { } kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim, - class function_info *entry) { + class function_info *entry, + unsigned long long streamID) { m_kernel_entry =
entry; m_grid_dim = gridDim; m_block_dim = blockDim; @@ -765,6 +768,7 @@ kernel_info_t::kernel_info_t(dim3 gridDim, dim3 blockDim, m_next_tid = m_next_cta; m_num_cores_running = 0; m_uid = (entry->gpgpu_ctx->kernel_info_m_next_uid)++; + m_streamID = streamID; m_param_mem = new memory_space_impl<8192>("param", 64 * 1024); // Jin: parent and child kernel management for CDP diff --git a/src/abstract_hardware_model.h b/src/abstract_hardware_model.h index e5f3b7859..98a403997 100644 --- a/src/abstract_hardware_model.h +++ b/src/abstract_hardware_model.h @@ -233,7 +233,8 @@ class kernel_info_t { // m_num_cores_running=0; // m_param_mem=NULL; // } - kernel_info_t(dim3 gridDim, dim3 blockDim, class function_info *entry); + kernel_info_t(dim3 gridDim, dim3 blockDim, class function_info *entry, + unsigned long long streamID); kernel_info_t( dim3 gridDim, dim3 blockDim, class function_info *entry, std::map nameToCudaArray, @@ -292,6 +293,7 @@ class kernel_info_t { m_next_tid.x < m_block_dim.x; } unsigned get_uid() const { return m_uid; } + unsigned long long get_streamID() const { return m_streamID; } std::string get_name() const { return name(); } std::string name() const; @@ -325,7 +327,8 @@ class kernel_info_t { class function_info *m_kernel_entry; - unsigned m_uid; + unsigned m_uid; // Kernel ID + unsigned long long m_streamID; // These maps contain the snapshot of the texture mappings at kernel launch std::map m_NameToCudaArray; @@ -900,8 +903,8 @@ class mem_fetch_interface { class mem_fetch_allocator { public: virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, - unsigned size, bool wr, - unsigned long long cycle) const = 0; + unsigned size, bool wr, unsigned long long cycle, + unsigned long long streamID) const = 0; virtual mem_fetch *alloc(const class warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const = 0; @@ -911,7 +914,8 @@ class mem_fetch_allocator { const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, unsigned 
long long cycle, unsigned wid, unsigned sid, unsigned tpc, - mem_fetch *original_mf) const = 0; + mem_fetch *original_mf, + unsigned long long streamID) const = 0; }; // the maximum number of destination, source, or address uarch operands in a @@ -1059,6 +1063,7 @@ class warp_inst_t : public inst_t { // constructors warp_inst_t() { m_uid = 0; + m_streamID = (unsigned long long)-1; m_empty = true; m_config = NULL; @@ -1071,6 +1076,7 @@ class warp_inst_t : public inst_t { } warp_inst_t(const core_config *config) { m_uid = 0; + m_streamID = (unsigned long long)-1; assert(config->warp_size <= MAX_WARP_SIZE); m_config = config; m_empty = true; @@ -1098,7 +1104,8 @@ class warp_inst_t : public inst_t { void clear() { m_empty = true; } void issue(const active_mask_t &mask, unsigned warp_id, - unsigned long long cycle, int dynamic_warp_id, int sch_id); + unsigned long long cycle, int dynamic_warp_id, int sch_id, + unsigned long long streamID); const active_mask_t &get_active_mask() const { return m_warp_active_mask; } void completed(unsigned long long cycle) @@ -1226,11 +1233,13 @@ class warp_inst_t : public inst_t { void print(FILE *fout) const; unsigned get_uid() const { return m_uid; } + unsigned long long get_streamID() const { return m_streamID; } unsigned get_schd_id() const { return m_scheduler_id; } active_mask_t get_warp_active_mask() const { return m_warp_active_mask; } protected: unsigned m_uid; + unsigned long long m_streamID; bool m_empty; bool m_cache_hit; unsigned long long issue_cycle; diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index 32cc56b63..cd3c88033 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -634,14 +634,6 @@ void mshr_table::display(FILE *fp) const { /***************************************************************** Caches * *****************************************************************/ cache_stats::cache_stats() { - m_stats.resize(NUM_MEM_ACCESS_TYPE); - 
m_stats_pw.resize(NUM_MEM_ACCESS_TYPE); - m_fail_stats.resize(NUM_MEM_ACCESS_TYPE); - for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) { - m_stats[i].resize(NUM_CACHE_REQUEST_STATUS, 0); - m_stats_pw[i].resize(NUM_CACHE_REQUEST_STATUS, 0); - m_fail_stats[i].resize(NUM_CACHE_RESERVATION_FAIL_STATUS, 0); - } m_cache_port_available_cycles = 0; m_cache_data_port_busy_cycles = 0; m_cache_fill_port_busy_cycles = 0; @@ -651,11 +643,10 @@ void cache_stats::clear() { /// /// Zero out all current cache statistics /// - for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) { - std::fill(m_stats[i].begin(), m_stats[i].end(), 0); - std::fill(m_stats_pw[i].begin(), m_stats_pw[i].end(), 0); - std::fill(m_fail_stats[i].begin(), m_fail_stats[i].end(), 0); - } + m_stats.clear(); + m_stats_pw.clear(); + m_fail_stats.clear(); + m_cache_port_available_cycles = 0; m_cache_data_port_busy_cycles = 0; m_cache_fill_port_busy_cycles = 0; @@ -665,35 +656,67 @@ void cache_stats::clear_pw() { /// /// Zero out per-window cache statistics /// - for (unsigned i = 0; i < NUM_MEM_ACCESS_TYPE; ++i) { - std::fill(m_stats_pw[i].begin(), m_stats_pw[i].end(), 0); - } + m_stats_pw.clear(); } -void cache_stats::inc_stats(int access_type, int access_outcome) { +void cache_stats::inc_stats(int access_type, int access_outcome, + unsigned long long streamID) { /// /// Increment the stat corresponding to (access_type, access_outcome) by 1. 
/// if (!check_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or access outcome"); - m_stats[access_type][access_outcome]++; + if (m_stats.find(streamID) == m_stats.end()) { + std::vector> new_val; + new_val.resize(NUM_MEM_ACCESS_TYPE); + for (unsigned j = 0; j < NUM_MEM_ACCESS_TYPE; ++j) { + new_val[j].resize(NUM_CACHE_REQUEST_STATUS, 0); + } + m_stats.insert(std::pair>>( + streamID, new_val)); + } + m_stats.at(streamID)[access_type][access_outcome]++; } -void cache_stats::inc_stats_pw(int access_type, int access_outcome) { +void cache_stats::inc_stats_pw(int access_type, int access_outcome, + unsigned long long streamID) { /// /// Increment the corresponding per-window cache stat /// if (!check_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or access outcome"); - m_stats_pw[access_type][access_outcome]++; + + if (m_stats_pw.find(streamID) == m_stats_pw.end()) { + std::vector> new_val; + new_val.resize(NUM_MEM_ACCESS_TYPE); + for (unsigned j = 0; j < NUM_MEM_ACCESS_TYPE; ++j) { + new_val[j].resize(NUM_CACHE_REQUEST_STATUS, 0); + } + m_stats_pw.insert(std::pair>>( + streamID, new_val)); + } + m_stats_pw.at(streamID)[access_type][access_outcome]++; } -void cache_stats::inc_fail_stats(int access_type, int fail_outcome) { +void cache_stats::inc_fail_stats(int access_type, int fail_outcome, + unsigned long long streamID) { if (!check_fail_valid(access_type, fail_outcome)) assert(0 && "Unknown cache access type or access fail"); - m_fail_stats[access_type][fail_outcome]++; + if (m_fail_stats.find(streamID) == m_fail_stats.end()) { + std::vector> new_val; + new_val.resize(NUM_MEM_ACCESS_TYPE); + for (unsigned j = 0; j < NUM_MEM_ACCESS_TYPE; ++j) { + new_val[j].resize(NUM_CACHE_RESERVATION_FAIL_STATUS, 0); + } + m_fail_stats.insert(std::pair>>( + streamID, new_val)); + } + m_fail_stats.at(streamID)[access_type][fail_outcome]++; } enum cache_request_status cache_stats::select_stats_status( @@ -712,7 +735,8 @@ enum 
cache_request_status cache_stats::select_stats_status( } unsigned long long &cache_stats::operator()(int access_type, int access_outcome, - bool fail_outcome) { + bool fail_outcome, + unsigned long long streamID) { /// /// Simple method to read/modify the stat corresponding to (access_type, /// access_outcome) Used overloaded () to avoid the need for separate @@ -722,17 +746,18 @@ unsigned long long &cache_stats::operator()(int access_type, int access_outcome, if (!check_fail_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or fail outcome"); - return m_fail_stats[access_type][access_outcome]; + return m_fail_stats.at(streamID)[access_type][access_outcome]; } else { if (!check_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or access outcome"); - return m_stats[access_type][access_outcome]; + return m_stats.at(streamID)[access_type][access_outcome]; } } unsigned long long cache_stats::operator()(int access_type, int access_outcome, - bool fail_outcome) const { + bool fail_outcome, + unsigned long long streamID) const { /// /// Const accessor into m_stats. 
/// @@ -740,12 +765,12 @@ unsigned long long cache_stats::operator()(int access_type, int access_outcome, if (!check_fail_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or fail outcome"); - return m_fail_stats[access_type][access_outcome]; + return m_fail_stats.at(streamID)[access_type][access_outcome]; } else { if (!check_valid(access_type, access_outcome)) assert(0 && "Unknown cache access type or access outcome"); - return m_stats[access_type][access_outcome]; + return m_stats.at(streamID)[access_type][access_outcome]; } } @@ -754,15 +779,74 @@ cache_stats cache_stats::operator+(const cache_stats &cs) { /// Overloaded + operator to allow for simple stat accumulation /// cache_stats ret; - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - ret(type, status, false) = - m_stats[type][status] + cs(type, status, false); + for (auto iter = m_stats.begin(); iter != m_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + ret.m_stats.insert(std::pair>>( + streamID, m_stats.at(streamID))); + } + for (auto iter = m_stats_pw.begin(); iter != m_stats_pw.end(); ++iter) { + unsigned long long streamID = iter->first; + ret.m_stats_pw.insert( + std::pair>>( + streamID, m_stats_pw.at(streamID))); + } + for (auto iter = m_fail_stats.begin(); iter != m_fail_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + ret.m_fail_stats.insert( + std::pair>>( + streamID, m_fail_stats.at(streamID))); + } + for (auto iter = cs.m_stats.begin(); iter != cs.m_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + if (ret.m_stats.find(streamID) == ret.m_stats.end()) { + ret.m_stats.insert( + std::pair>>( + streamID, cs.m_stats.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + ret.m_stats.at(streamID)[type][status] += + 
cs(type, status, false, streamID); + } + } } - for (unsigned status = 0; status < NUM_CACHE_RESERVATION_FAIL_STATUS; - ++status) { - ret(type, status, true) = - m_fail_stats[type][status] + cs(type, status, true); + } + for (auto iter = cs.m_stats_pw.begin(); iter != cs.m_stats_pw.end(); ++iter) { + unsigned long long streamID = iter->first; + if (ret.m_stats_pw.find(streamID) == ret.m_stats_pw.end()) { + ret.m_stats_pw.insert( + std::pair>>( + streamID, cs.m_stats_pw.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + ret.m_stats_pw.at(streamID)[type][status] += + cs(type, status, false, streamID); + } + } + } + } + for (auto iter = cs.m_fail_stats.begin(); iter != cs.m_fail_stats.end(); + ++iter) { + unsigned long long streamID = iter->first; + if (ret.m_fail_stats.find(streamID) == ret.m_fail_stats.end()) { + ret.m_fail_stats.insert( + std::pair>>( + streamID, cs.m_fail_stats.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_RESERVATION_FAIL_STATUS; + ++status) { + ret.m_fail_stats.at(streamID)[type][status] += + cs(type, status, true, streamID); + } + } } } ret.m_cache_port_available_cycles = @@ -778,16 +862,52 @@ cache_stats &cache_stats::operator+=(const cache_stats &cs) { /// /// Overloaded += operator to allow for simple stat accumulation /// - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - m_stats[type][status] += cs(type, status, false); + for (auto iter = cs.m_stats.begin(); iter != cs.m_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + if (m_stats.find(streamID) == m_stats.end()) { + m_stats.insert(std::pair>>( + streamID, cs.m_stats.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned 
status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + m_stats.at(streamID)[type][status] += + cs(type, status, false, streamID); + } + } } - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - m_stats_pw[type][status] += cs(type, status, false); + } + for (auto iter = cs.m_stats_pw.begin(); iter != cs.m_stats_pw.end(); ++iter) { + unsigned long long streamID = iter->first; + if (m_stats_pw.find(streamID) == m_stats_pw.end()) { + m_stats_pw.insert(std::pair>>( + streamID, cs.m_stats_pw.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + m_stats_pw.at(streamID)[type][status] += + cs(type, status, false, streamID); + } + } } - for (unsigned status = 0; status < NUM_CACHE_RESERVATION_FAIL_STATUS; - ++status) { - m_fail_stats[type][status] += cs(type, status, true); + } + for (auto iter = cs.m_fail_stats.begin(); iter != cs.m_fail_stats.end(); + ++iter) { + unsigned long long streamID = iter->first; + if (m_fail_stats.find(streamID) == m_fail_stats.end()) { + m_fail_stats.insert( + std::pair>>( + streamID, cs.m_fail_stats.at(streamID))); + } else { + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_RESERVATION_FAIL_STATUS; + ++status) { + m_fail_stats.at(streamID)[type][status] += + cs(type, status, true, streamID); + } + } } } m_cache_port_available_cycles += cs.m_cache_port_available_cycles; @@ -796,46 +916,65 @@ cache_stats &cache_stats::operator+=(const cache_stats &cs) { return *this; } -void cache_stats::print_stats(FILE *fout, const char *cache_name) const { +void cache_stats::print_stats(FILE *fout, unsigned long long streamID, + const char *cache_name) const { /// - /// Print out each non-zero cache statistic for every memory access type and - /// status "cache_name" defaults to "Cache_stats" when no argument is - /// provided, otherwise the provided name is 
used. The printed format is + /// For a given CUDA stream, print out each non-zero cache statistic for every + /// memory access type and status "cache_name" defaults to "Cache_stats" when + /// no argument is provided, otherwise the provided name is used. The printed + /// format is /// "[][] = " - /// + /// Specify streamID to be -1 to print every stream. + std::vector total_access; - total_access.resize(NUM_MEM_ACCESS_TYPE, 0); std::string m_cache_name = cache_name; - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - fprintf(fout, "\t%s[%s][%s] = %llu\n", m_cache_name.c_str(), - mem_access_type_str((enum mem_access_type)type), - cache_request_status_str((enum cache_request_status)status), - m_stats[type][status]); + for (auto iter = m_stats.begin(); iter != m_stats.end(); ++iter) { + unsigned long long streamid = iter->first; + // when streamID is specified, skip stats for all other streams, otherwise, + // print stats from all streams + if ((streamID != -1) && (streamid != streamID)) continue; + total_access.clear(); + total_access.resize(NUM_MEM_ACCESS_TYPE, 0); + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + fprintf(fout, "\t%s[%s][%s] = %llu\n", m_cache_name.c_str(), + mem_access_type_str((enum mem_access_type)type), + cache_request_status_str((enum cache_request_status)status), + m_stats.at(streamid)[type][status]); - if (status != RESERVATION_FAIL && status != MSHR_HIT) - // MSHR_HIT is a special type of SECTOR_MISS - // so its already included in the SECTOR_MISS - total_access[type] += m_stats[type][status]; + if (status != RESERVATION_FAIL && status != MSHR_HIT) + // MSHR_HIT is a special type of SECTOR_MISS + // so its already included in the SECTOR_MISS + total_access[type] += m_stats.at(streamid)[type][status]; + } + } + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; 
++type) { + if (total_access[type] > 0) + fprintf(fout, "\t%s[%s][%s] = %u\n", m_cache_name.c_str(), + mem_access_type_str((enum mem_access_type)type), "TOTAL_ACCESS", + total_access[type]); } - } - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - if (total_access[type] > 0) - fprintf(fout, "\t%s[%s][%s] = %u\n", m_cache_name.c_str(), - mem_access_type_str((enum mem_access_type)type), "TOTAL_ACCESS", - total_access[type]); } } -void cache_stats::print_fail_stats(FILE *fout, const char *cache_name) const { +void cache_stats::print_fail_stats(FILE *fout, unsigned long long streamID, + const char *cache_name) const { std::string m_cache_name = cache_name; - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned fail = 0; fail < NUM_CACHE_RESERVATION_FAIL_STATUS; ++fail) { - if (m_fail_stats[type][fail] > 0) { - fprintf(fout, "\t%s[%s][%s] = %llu\n", m_cache_name.c_str(), - mem_access_type_str((enum mem_access_type)type), - cache_fail_status_str((enum cache_reservation_fail_reason)fail), - m_fail_stats[type][fail]); + for (auto iter = m_fail_stats.begin(); iter != m_fail_stats.end(); ++iter) { + unsigned long long streamid = iter->first; + // when streamID is specified, skip stats for all other streams, otherwise, + // print stats from all streams + if ((streamID != -1) && (streamid != streamID)) continue; + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned fail = 0; fail < NUM_CACHE_RESERVATION_FAIL_STATUS; + ++fail) { + if (m_fail_stats.at(streamid)[type][fail] > 0) { + fprintf( + fout, "\t%s[%s][%s] = %llu\n", m_cache_name.c_str(), + mem_access_type_str((enum mem_access_type)type), + cache_fail_status_str((enum cache_reservation_fail_reason)fail), + m_fail_stats.at(streamid)[type][fail]); + } } } } @@ -866,11 +1005,14 @@ unsigned long long cache_stats::get_stats( /// cache_request_statuses. 
/// unsigned long long total = 0; - for (unsigned type = 0; type < num_access_type; ++type) { - for (unsigned status = 0; status < num_access_status; ++status) { - if (!check_valid((int)access_type[type], (int)access_status[status])) - assert(0 && "Unknown cache access type or access outcome"); - total += m_stats[access_type[type]][access_status[status]]; + for (auto iter = m_stats.begin(); iter != m_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + for (unsigned type = 0; type < num_access_type; ++type) { + for (unsigned status = 0; status < num_access_status; ++status) { + if (!check_valid((int)access_type[type], (int)access_status[status])) + assert(0 && "Unknown cache access type or access outcome"); + total += m_stats.at(streamID)[access_type[type]][access_status[status]]; + } } } return total; @@ -883,18 +1025,23 @@ void cache_stats::get_sub_stats(struct cache_sub_stats &css) const { struct cache_sub_stats t_css; t_css.clear(); - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - if (status == HIT || status == MISS || status == SECTOR_MISS || - status == HIT_RESERVED) - t_css.accesses += m_stats[type][status]; + for (auto iter = m_stats.begin(); iter != m_stats.end(); ++iter) { + unsigned long long streamID = iter->first; + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + if (status == HIT || status == MISS || status == SECTOR_MISS || + status == HIT_RESERVED) + t_css.accesses += m_stats.at(streamID)[type][status]; - if (status == MISS || status == SECTOR_MISS) - t_css.misses += m_stats[type][status]; + if (status == MISS || status == SECTOR_MISS) + t_css.misses += m_stats.at(streamID)[type][status]; - if (status == HIT_RESERVED) t_css.pending_hits += m_stats[type][status]; + if (status == HIT_RESERVED) + t_css.pending_hits += m_stats.at(streamID)[type][status]; - 
if (status == RESERVATION_FAIL) t_css.res_fails += m_stats[type][status]; + if (status == RESERVATION_FAIL) + t_css.res_fails += m_stats.at(streamID)[type][status]; + } } } @@ -912,41 +1059,48 @@ void cache_stats::get_sub_stats_pw(struct cache_sub_stats_pw &css) const { struct cache_sub_stats_pw t_css; t_css.clear(); - for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { - for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { - if (status == HIT || status == MISS || status == SECTOR_MISS || - status == HIT_RESERVED) - t_css.accesses += m_stats_pw[type][status]; - - if (status == HIT) { - if (type == GLOBAL_ACC_R || type == CONST_ACC_R || type == INST_ACC_R) { - t_css.read_hits += m_stats_pw[type][status]; - } else if (type == GLOBAL_ACC_W) { - t_css.write_hits += m_stats_pw[type][status]; + for (auto iter = m_stats_pw.begin(); iter != m_stats_pw.end(); ++iter) { + unsigned long long streamID = iter->first; + for (unsigned type = 0; type < NUM_MEM_ACCESS_TYPE; ++type) { + for (unsigned status = 0; status < NUM_CACHE_REQUEST_STATUS; ++status) { + if (status == HIT || status == MISS || status == SECTOR_MISS || + status == HIT_RESERVED) + t_css.accesses += m_stats_pw.at(streamID)[type][status]; + + if (status == HIT) { + if (type == GLOBAL_ACC_R || type == CONST_ACC_R || + type == INST_ACC_R) { + t_css.read_hits += m_stats_pw.at(streamID)[type][status]; + } else if (type == GLOBAL_ACC_W) { + t_css.write_hits += m_stats_pw.at(streamID)[type][status]; + } } - } - if (status == MISS || status == SECTOR_MISS) { - if (type == GLOBAL_ACC_R || type == CONST_ACC_R || type == INST_ACC_R) { - t_css.read_misses += m_stats_pw[type][status]; - } else if (type == GLOBAL_ACC_W) { - t_css.write_misses += m_stats_pw[type][status]; + if (status == MISS || status == SECTOR_MISS) { + if (type == GLOBAL_ACC_R || type == CONST_ACC_R || + type == INST_ACC_R) { + t_css.read_misses += m_stats_pw.at(streamID)[type][status]; + } else if (type == GLOBAL_ACC_W) { + 
t_css.write_misses += m_stats_pw.at(streamID)[type][status]; + } } - } - if (status == HIT_RESERVED) { - if (type == GLOBAL_ACC_R || type == CONST_ACC_R || type == INST_ACC_R) { - t_css.read_pending_hits += m_stats_pw[type][status]; - } else if (type == GLOBAL_ACC_W) { - t_css.write_pending_hits += m_stats_pw[type][status]; + if (status == HIT_RESERVED) { + if (type == GLOBAL_ACC_R || type == CONST_ACC_R || + type == INST_ACC_R) { + t_css.read_pending_hits += m_stats_pw.at(streamID)[type][status]; + } else if (type == GLOBAL_ACC_W) { + t_css.write_pending_hits += m_stats_pw.at(streamID)[type][status]; + } } - } - if (status == RESERVATION_FAIL) { - if (type == GLOBAL_ACC_R || type == CONST_ACC_R || type == INST_ACC_R) { - t_css.read_res_fails += m_stats_pw[type][status]; - } else if (type == GLOBAL_ACC_W) { - t_css.write_res_fails += m_stats_pw[type][status]; + if (status == RESERVATION_FAIL) { + if (type == GLOBAL_ACC_R || type == CONST_ACC_R || + type == INST_ACC_R) { + t_css.read_res_fails += m_stats_pw.at(streamID)[type][status]; + } else if (type == GLOBAL_ACC_W) { + t_css.write_res_fails += m_stats_pw.at(streamID)[type][status]; + } } } } @@ -1139,6 +1293,50 @@ void baseline_cache::display_state(FILE *fp) const { fprintf(fp, "\n"); } +void baseline_cache::inc_aggregated_stats(cache_request_status status, + cache_request_status cache_status, + mem_fetch *mf, + enum cache_gpu_level level) { + if (level == L1_GPU_CACHE) { + m_gpu->aggregated_l1_stats.inc_stats( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l1_stats.select_stats_status(status, cache_status)); + } else if (level == L2_GPU_CACHE) { + m_gpu->aggregated_l2_stats.inc_stats( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l2_stats.select_stats_status(status, cache_status)); + } +} + +void baseline_cache::inc_aggregated_fail_stats( + cache_request_status status, cache_request_status cache_status, + mem_fetch *mf, enum cache_gpu_level level) { + if (level == 
L1_GPU_CACHE) { + m_gpu->aggregated_l1_stats.inc_fail_stats( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l1_stats.select_stats_status(status, cache_status)); + } else if (level == L2_GPU_CACHE) { + m_gpu->aggregated_l2_stats.inc_fail_stats( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l2_stats.select_stats_status(status, cache_status)); + } +} + +void baseline_cache::inc_aggregated_stats_pw(cache_request_status status, + cache_request_status cache_status, + mem_fetch *mf, + enum cache_gpu_level level) { + if (level == L1_GPU_CACHE) { + m_gpu->aggregated_l1_stats.inc_stats_pw( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l1_stats.select_stats_status(status, cache_status)); + } else if (level == L2_GPU_CACHE) { + m_gpu->aggregated_l2_stats.inc_stats_pw( + mf->get_streamID(), mf->get_access_type(), + m_gpu->aggregated_l2_stats.select_stats_status(status, cache_status)); + } +} + /// Read miss handler without writeback void baseline_cache::send_read_request(new_addr_type addr, new_addr_type block_addr, @@ -1170,7 +1368,7 @@ void baseline_cache::send_read_request(new_addr_type addr, m_tag_array->access(block_addr, time, cache_index, wb, evicted, mf); m_mshrs.add(mshr_addr, mf); - m_stats.inc_stats(mf->get_access_type(), MSHR_HIT); + m_stats.inc_stats(mf->get_access_type(), MSHR_HIT, mf->get_streamID()); do_miss = true; } else if (!mshr_hit && mshr_avail && @@ -1191,9 +1389,11 @@ void baseline_cache::send_read_request(new_addr_type addr, do_miss = true; } else if (mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL, + mf->get_streamID()); else if (!mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL, + mf->get_streamID()); else assert(0); } @@ -1253,7 +1453,8 @@ cache_request_status 
data_cache::wr_hit_wt(new_addr_type addr, std::list &events, enum cache_request_status status) { if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; // cannot handle request this cycle } @@ -1281,7 +1482,8 @@ cache_request_status data_cache::wr_hit_we(new_addr_type addr, std::list &events, enum cache_request_status status) { if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; // cannot handle request this cycle } @@ -1330,11 +1532,14 @@ enum cache_request_status data_cache::wr_miss_wa_naive( (m_miss_queue.size() < m_config.m_miss_queue_size)))) { // check what is the exactly the failure reason if (miss_queue_full(2)) - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); else if (mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL, + mf->get_streamID()); else if (!mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL, + mf->get_streamID()); else assert(0); @@ -1353,10 +1558,10 @@ enum cache_request_status data_cache::wr_miss_wa_naive( mf->get_access_warp_mask(), mf->get_access_byte_mask(), mf->get_access_sector_mask(), m_gpu->gpgpu_ctx); - mem_fetch *n_mf = - new mem_fetch(*ma, NULL, mf->get_ctrl_size(), mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); + mem_fetch *n_mf = new mem_fetch( + *ma, NULL, mf->get_streamID(), mf->get_ctrl_size(), mf->get_wid(), + mf->get_sid(), mf->get_tpc(), 
mf->get_mem_config(), + m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); bool do_miss = false; bool wb = false; @@ -1378,7 +1583,7 @@ enum cache_request_status data_cache::wr_miss_wa_naive( evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, - NULL); + NULL, mf->get_streamID()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1404,7 +1609,8 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( // reserve mshr if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; // cannot handle request this cycle } @@ -1431,7 +1637,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, - NULL); + NULL, mf->get_streamID()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1451,11 +1657,14 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( (m_miss_queue.size() < m_config.m_miss_queue_size)))) { // check what is the exactly the failure reason if (miss_queue_full(1)) - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); else if (mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_MERGE_ENRTY_FAIL, + 
mf->get_streamID()); else if (!mshr_hit && !mshr_avail) - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_ENRTY_FAIL, + mf->get_streamID()); else assert(0); @@ -1468,7 +1677,8 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( if (m_mshrs.probe(mshr_addr) && m_mshrs.is_read_after_write_pending(mshr_addr) && mf->is_write()) { // assert(0); - m_stats.inc_fail_stats(mf->get_access_type(), MSHR_RW_PENDING); + m_stats.inc_fail_stats(mf->get_access_type(), MSHR_RW_PENDING, + mf->get_streamID()); return RESERVATION_FAIL; } @@ -1479,8 +1689,8 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( mf->get_access_sector_mask(), m_gpu->gpgpu_ctx); mem_fetch *n_mf = new mem_fetch( - *ma, NULL, mf->get_ctrl_size(), mf->get_wid(), mf->get_sid(), - mf->get_tpc(), mf->get_mem_config(), + *ma, NULL, mf->get_streamID(), mf->get_ctrl_size(), mf->get_wid(), + mf->get_sid(), mf->get_tpc(), mf->get_mem_config(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, NULL, mf); new_addr_type block_addr = m_config.block_addr(addr); @@ -1504,7 +1714,7 @@ enum cache_request_status data_cache::wr_miss_wa_fetch_on_write( evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, - NULL); + NULL, mf->get_streamID()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1528,7 +1738,8 @@ enum cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( // mshr if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; // cannot handle request this cycle } @@ -1571,7 +1782,7 @@ enum 
cache_request_status data_cache::wr_miss_wa_lazy_fetch_on_read( evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, - NULL); + NULL, mf->get_streamID()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1589,7 +1800,8 @@ enum cache_request_status data_cache::wr_miss_no_wa( new_addr_type addr, unsigned cache_index, mem_fetch *mf, unsigned time, std::list &events, enum cache_request_status status) { if (miss_queue_full(0)) { - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; // cannot handle request this cycle } @@ -1634,7 +1846,8 @@ enum cache_request_status data_cache::rd_miss_base( if (miss_queue_full(1)) { // cannot handle request this cycle // (might need to generate two requests) - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); return RESERVATION_FAIL; } @@ -1653,7 +1866,7 @@ enum cache_request_status data_cache::rd_miss_base( evicted.m_block_addr, m_wrbk_type, mf->get_access_warp_mask(), evicted.m_byte_mask, evicted.m_sector_mask, evicted.m_modified_size, true, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, -1, -1, -1, - NULL); + NULL, mf->get_streamID()); // the evicted block may have wrong chip id when advanced L2 hashing is // used, so set the right chip address from the original mf wb->set_chip(mf->get_tlx_addr().chip); @@ -1693,16 +1906,20 @@ enum cache_request_status read_only_cache::access( cache_status = RESERVATION_FAIL; } else { cache_status = RESERVATION_FAIL; - m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL); + 
m_stats.inc_fail_stats(mf->get_access_type(), MISS_QUEUE_FULL, + mf->get_streamID()); } } else { - m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL, + mf->get_streamID()); } m_stats.inc_stats(mf->get_access_type(), - m_stats.select_stats_status(status, cache_status)); + m_stats.select_stats_status(status, cache_status), + mf->get_streamID()); m_stats.inc_stats_pw(mf->get_access_type(), - m_stats.select_stats_status(status, cache_status)); + m_stats.select_stats_status(status, cache_status), + mf->get_streamID()); return cache_status; } @@ -1730,7 +1947,8 @@ enum cache_request_status data_cache::process_tag_probe( } else { // the only reason for reservation fail here is LINE_ALLOC_FAIL (i.e all // lines are reserved) - m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL, + mf->get_streamID()); } } else { // Read if (probe_status == HIT) { @@ -1742,7 +1960,8 @@ enum cache_request_status data_cache::process_tag_probe( } else { // the only reason for reservation fail here is LINE_ALLOC_FAIL (i.e all // lines are reserved) - m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL); + m_stats.inc_fail_stats(mf->get_access_type(), LINE_ALLOC_FAIL, + mf->get_streamID()); } } @@ -1767,9 +1986,11 @@ enum cache_request_status data_cache::access(new_addr_type addr, mem_fetch *mf, enum cache_request_status access_status = process_tag_probe(wr, probe_status, addr, cache_index, mf, time, events); m_stats.inc_stats(mf->get_access_type(), - m_stats.select_stats_status(probe_status, access_status)); - m_stats.inc_stats_pw(mf->get_access_type(), m_stats.select_stats_status( - probe_status, access_status)); + m_stats.select_stats_status(probe_status, access_status), + mf->get_streamID()); + m_stats.inc_stats_pw(mf->get_access_type(), + m_stats.select_stats_status(probe_status, access_status), + mf->get_streamID()); return 
access_status; } @@ -1831,9 +2052,11 @@ enum cache_request_status tex_cache::access(new_addr_type addr, mem_fetch *mf, cache_status = HIT_RESERVED; } m_stats.inc_stats(mf->get_access_type(), - m_stats.select_stats_status(status, cache_status)); + m_stats.select_stats_status(status, cache_status), + mf->get_streamID()); m_stats.inc_stats_pw(mf->get_access_type(), - m_stats.select_stats_status(status, cache_status)); + m_stats.select_stats_status(status, cache_status), + mf->get_streamID()); return cache_status; } diff --git a/src/gpgpu-sim/gpu-cache.h b/src/gpgpu-sim/gpu-cache.h index 5fd40a9bc..c07695fa8 100644 --- a/src/gpgpu-sim/gpu-cache.h +++ b/src/gpgpu-sim/gpu-cache.h @@ -72,6 +72,13 @@ enum cache_event_type { WRITE_ALLOCATE_SENT }; +enum cache_gpu_level { + L1_GPU_CACHE = 0, + L2_GPU_CACHE, + OTHER_GPU_CACHE, + NUM_CACHE_GPU_LEVELS +}; + struct evicted_block_info { new_addr_type m_block_addr; unsigned m_modified_size; @@ -1200,20 +1207,26 @@ class cache_stats { void clear(); // Clear AerialVision cache stats after each window void clear_pw(); - void inc_stats(int access_type, int access_outcome); + void inc_stats(int access_type, int access_outcome, + unsigned long long streamID); // Increment AerialVision cache stats - void inc_stats_pw(int access_type, int access_outcome); - void inc_fail_stats(int access_type, int fail_outcome); + void inc_stats_pw(int access_type, int access_outcome, + unsigned long long streamID); + void inc_fail_stats(int access_type, int fail_outcome, + unsigned long long streamID); enum cache_request_status select_stats_status( enum cache_request_status probe, enum cache_request_status access) const; unsigned long long &operator()(int access_type, int access_outcome, - bool fail_outcome); + bool fail_outcome, + unsigned long long streamID); unsigned long long operator()(int access_type, int access_outcome, - bool fail_outcome) const; + bool fail_outcome, + unsigned long long streamID) const; cache_stats operator+(const cache_stats 
&cs); cache_stats &operator+=(const cache_stats &cs); - void print_stats(FILE *fout, const char *cache_name = "Cache_stats") const; - void print_fail_stats(FILE *fout, + void print_stats(FILE *fout, unsigned long long streamID, + const char *cache_name = "Cache_stats") const; + void print_fail_stats(FILE *fout, unsigned long long streamID, const char *cache_name = "Cache_fail_stats") const; unsigned long long get_stats(enum mem_access_type *access_type, @@ -1231,10 +1244,14 @@ class cache_stats { bool check_valid(int type, int status) const; bool check_fail_valid(int type, int fail) const; - std::vector > m_stats; + // CUDA streamID -> cache stats[NUM_MEM_ACCESS_TYPE] + std::map>> + m_stats; // AerialVision cache stats (per-window) - std::vector > m_stats_pw; - std::vector > m_fail_stats; + std::map>> + m_stats_pw; + std::map>> + m_fail_stats; unsigned long long m_cache_port_available_cycles; unsigned long long m_cache_data_port_busy_cycles; @@ -1264,11 +1281,14 @@ class baseline_cache : public cache_t { public: baseline_cache(const char *name, cache_config &config, int core_id, int type_id, mem_fetch_interface *memport, - enum mem_fetch_status status) + enum mem_fetch_status status, enum cache_gpu_level level, + gpgpu_sim *gpu) : m_config(config), m_tag_array(new tag_array(config, core_id, type_id)), m_mshrs(config.m_mshr_entries, config.m_mshr_max_merge), - m_bandwidth_management(config) { + m_bandwidth_management(config), + m_level(level), + m_gpu(gpu) { init(name, config, memport, status); } @@ -1336,6 +1356,15 @@ class baseline_cache : public cache_t { bool fill_port_free() const { return m_bandwidth_management.fill_port_free(); } + void inc_aggregated_stats(cache_request_status status, + cache_request_status cache_status, mem_fetch *mf, + enum cache_gpu_level level); + void inc_aggregated_fail_stats(cache_request_status status, + cache_request_status cache_status, + mem_fetch *mf, enum cache_gpu_level level); + void 
inc_aggregated_stats_pw(cache_request_status status, + cache_request_status cache_status, mem_fetch *mf, + enum cache_gpu_level level); // This is a gapping hole we are poking in the system to quickly handle // filling the cache on cudamemcopies. We don't care about anything other than @@ -1367,6 +1396,8 @@ class baseline_cache : public cache_t { std::list m_miss_queue; enum mem_fetch_status m_miss_queue_status; mem_fetch_interface *m_memport; + cache_gpu_level m_level; + gpgpu_sim *m_gpu; struct extra_mf_fields { extra_mf_fields() { m_valid = false; } @@ -1453,8 +1484,10 @@ class read_only_cache : public baseline_cache { public: read_only_cache(const char *name, cache_config &config, int core_id, int type_id, mem_fetch_interface *memport, - enum mem_fetch_status status) - : baseline_cache(name, config, core_id, type_id, memport, status) {} + enum mem_fetch_status status, enum cache_gpu_level level, + gpgpu_sim *gpu) + : baseline_cache(name, config, core_id, type_id, memport, status, level, + gpu) {} /// Access cache for read_only_cache: returns RESERVATION_FAIL if request /// could not be accepted (for any reason) @@ -1478,8 +1511,10 @@ class data_cache : public baseline_cache { data_cache(const char *name, cache_config &config, int core_id, int type_id, mem_fetch_interface *memport, mem_fetch_allocator *mfcreator, enum mem_fetch_status status, mem_access_type wr_alloc_type, - mem_access_type wrbk_type, class gpgpu_sim *gpu) - : baseline_cache(name, config, core_id, type_id, memport, status) { + mem_access_type wrbk_type, class gpgpu_sim *gpu, + enum cache_gpu_level level) + : baseline_cache(name, config, core_id, type_id, memport, status, level, + gpu) { init(mfcreator); m_wr_alloc_type = wr_alloc_type; m_wrbk_type = wrbk_type; @@ -1668,9 +1703,10 @@ class l1_cache : public data_cache { public: l1_cache(const char *name, cache_config &config, int core_id, int type_id, mem_fetch_interface *memport, mem_fetch_allocator *mfcreator, - enum mem_fetch_status status, 
class gpgpu_sim *gpu) + enum mem_fetch_status status, class gpgpu_sim *gpu, + enum cache_gpu_level level) : data_cache(name, config, core_id, type_id, memport, mfcreator, status, - L1_WR_ALLOC_R, L1_WRBK_ACC, gpu) {} + L1_WR_ALLOC_R, L1_WRBK_ACC, gpu, level) {} virtual ~l1_cache() {} @@ -1693,9 +1729,10 @@ class l2_cache : public data_cache { public: l2_cache(const char *name, cache_config &config, int core_id, int type_id, mem_fetch_interface *memport, mem_fetch_allocator *mfcreator, - enum mem_fetch_status status, class gpgpu_sim *gpu) + enum mem_fetch_status status, class gpgpu_sim *gpu, + enum cache_gpu_level level) : data_cache(name, config, core_id, type_id, memport, mfcreator, status, - L2_WR_ALLOC_R, L2_WRBK_ACC, gpu) {} + L2_WR_ALLOC_R, L2_WRBK_ACC, gpu, level) {} virtual ~l2_cache() {} diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 1cb8a251e..0c922bdb3 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -788,6 +788,22 @@ void increment_x_then_y_then_z(dim3 &i, const dim3 &bound) { } void gpgpu_sim::launch(kernel_info_t *kinfo) { + unsigned kernelID = kinfo->get_uid(); + unsigned long long streamID = kinfo->get_streamID(); + + kernel_time_t kernel_time = {gpu_tot_sim_cycle + gpu_sim_cycle, 0}; + if (gpu_kernel_time.find(streamID) == gpu_kernel_time.end()) { + std::map new_val; + new_val.insert(std::pair(kernelID, kernel_time)); + gpu_kernel_time.insert( + std::pair>( + streamID, new_val)); + } else { + gpu_kernel_time.at(streamID).insert( + std::pair(kernelID, kernel_time)); + ////////// assume same kernel ID do not appear more than once + } + unsigned cta_size = kinfo->threads_per_cta(); if (cta_size > m_shader_config->n_thread_per_shader) { printf( @@ -893,7 +909,10 @@ kernel_info_t *gpgpu_sim::select_kernel() { } unsigned gpgpu_sim::finished_kernel() { - if (m_finished_kernel.empty()) return 0; + if (m_finished_kernel.empty()) { + last_streamID = -1; + return 0; + } unsigned result = m_finished_kernel.front(); 
m_finished_kernel.pop_front(); return result; @@ -901,6 +920,11 @@ unsigned gpgpu_sim::finished_kernel() { void gpgpu_sim::set_kernel_done(kernel_info_t *kernel) { unsigned uid = kernel->get_uid(); + last_uid = uid; + unsigned long long streamID = kernel->get_streamID(); + last_streamID = streamID; + gpu_kernel_time.at(streamID).at(uid).end_cycle = + gpu_tot_sim_cycle + gpu_sim_cycle; m_finished_kernel.push_back(uid); std::vector::iterator k; for (k = m_running_kernels.begin(); k != m_running_kernels.end(); k++) { @@ -971,6 +995,9 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) gpu_tot_sim_cycle_parition_util = 0; partiton_replys_in_parallel = 0; partiton_replys_in_parallel_total = 0; + last_streamID = -1; + + gpu_kernel_time.clear(); m_memory_partition_unit = new memory_partition_unit *[m_memory_config->m_n_mem]; @@ -1178,9 +1205,9 @@ PowerscalingCoefficients *gpgpu_sim::get_scaling_coeffs() { return m_gpgpusim_wrapper->get_scaling_coeffs(); } -void gpgpu_sim::print_stats() { +void gpgpu_sim::print_stats(unsigned long long streamID) { gpgpu_ctx->stats->ptx_file_line_stats_write_file(); - gpu_print_stat(); + gpu_print_stat(streamID); if (g_network_mode) { printf( @@ -1363,12 +1390,15 @@ void gpgpu_sim::clear_executed_kernel_info() { m_executed_kernel_names.clear(); m_executed_kernel_uids.clear(); } -void gpgpu_sim::gpu_print_stat() { + +void gpgpu_sim::gpu_print_stat(unsigned long long streamID) { FILE *statfout = stdout; std::string kernel_info_str = executed_kernel_info_string(); fprintf(statfout, "%s", kernel_info_str.c_str()); + printf("kernel_stream_id = %llu\n", streamID); + printf("gpu_sim_cycle = %lld\n", gpu_sim_cycle); printf("gpu_sim_insn = %lld\n", gpu_sim_insn); printf("gpu_ipc = %12.4f\n", (float)gpu_sim_insn / gpu_sim_cycle); @@ -1440,9 +1470,10 @@ void gpgpu_sim::gpu_print_stat() { m_cluster[i]->get_cache_stats(core_cache_stats); } printf("\nTotal_core_cache_stats:\n"); - core_cache_stats.print_stats(stdout, 
"Total_core_cache_stats_breakdown"); + core_cache_stats.print_stats(stdout, streamID, + "Total_core_cache_stats_breakdown"); printf("\nTotal_core_cache_fail_stats:\n"); - core_cache_stats.print_fail_stats(stdout, + core_cache_stats.print_fail_stats(stdout, streamID, "Total_core_cache_fail_stats_breakdown"); shader_print_scheduler_stat(stdout, false); @@ -1510,9 +1541,10 @@ void gpgpu_sim::gpu_print_stat() { printf("L2_total_cache_reservation_fails = %llu\n", total_l2_css.res_fails); printf("L2_total_cache_breakdown:\n"); - l2_stats.print_stats(stdout, "L2_cache_stats_breakdown"); + l2_stats.print_stats(stdout, streamID, "L2_cache_stats_breakdown"); printf("L2_total_cache_reservation_fail_breakdown:\n"); - l2_stats.print_fail_stats(stdout, "L2_cache_stats_fail_breakdown"); + l2_stats.print_fail_stats(stdout, streamID, + "L2_cache_stats_fail_breakdown"); total_l2_css.print_port_stats(stdout, "L2_cache"); } } @@ -1955,8 +1987,10 @@ void gpgpu_sim::cycle() { if (mf) partiton_reqs_in_parallel_per_cycle++; } m_memory_sub_partition[i]->cache_cycle(gpu_sim_cycle + gpu_tot_sim_cycle); - m_memory_sub_partition[i]->accumulate_L2cache_stats( - m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX]); + if (m_config.g_power_simulation_enabled) { + m_memory_sub_partition[i]->accumulate_L2cache_stats( + m_power_stats->pwr_mem_stat->l2_cache_stats[CURRENT_STAT_IDX]); + } } } partiton_reqs_in_parallel += partiton_reqs_in_parallel_per_cycle; @@ -1978,14 +2012,16 @@ void gpgpu_sim::cycle() { *active_sms += m_cluster[i]->get_n_active_sms(); } // Update core icnt/cache stats for AccelWattch - m_cluster[i]->get_icnt_stats( - m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], - m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); - m_cluster[i]->get_cache_stats( - m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]); - m_cluster[i]->get_current_occupancy( - gpu_occupancy.aggregate_warp_slot_filled, - 
gpu_occupancy.aggregate_theoretical_warp_slots); + if (m_config.g_power_simulation_enabled) { + m_cluster[i]->get_icnt_stats( + m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); + m_cluster[i]->get_cache_stats( + m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]); + m_cluster[i]->get_current_occupancy( + gpu_occupancy.aggregate_warp_slot_filled, + gpu_occupancy.aggregate_theoretical_warp_slots); + } } float temp = 0; for (unsigned i = 0; i < m_shader_config->num_shader(); i++) { diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index d43b3995a..8e81451b6 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -539,7 +539,7 @@ class gpgpu_sim : public gpgpu_t { (m_config.gpu_max_completed_cta_opt && (gpu_completed_cta >= m_config.gpu_max_completed_cta_opt)); } - void print_stats(); + void print_stats(unsigned long long streamID); void update_stats(); void deadlock_check(); void inc_completed_cta() { gpu_completed_cta++; } @@ -568,7 +568,7 @@ class gpgpu_sim : public gpgpu_t { void decrement_kernel_latency(); const gpgpu_sim_config &get_config() const { return m_config; } - void gpu_print_stat(); + void gpu_print_stat(unsigned long long streamID); void dump_pipeline(int mask, int s, int m) const; void perf_memcpy_to_gpu(size_t dst_start_addr, size_t count); @@ -685,6 +685,17 @@ class gpgpu_sim : public gpgpu_t { occupancy_stats gpu_occupancy; occupancy_stats gpu_tot_occupancy; + typedef struct { + unsigned long long start_cycle; + unsigned long long end_cycle; + } kernel_time_t; + std::map> + gpu_kernel_time; + unsigned long long last_streamID; + unsigned long long last_uid; + cache_stats aggregated_l1_stats; + cache_stats aggregated_l2_stats; + // performance counter for stalls due to congestion. 
unsigned int gpu_stall_dramfull; unsigned int gpu_stall_icnt2sh; @@ -712,6 +723,9 @@ class gpgpu_sim : public gpgpu_t { public: bool is_functional_sim() { return m_functional_sim; } kernel_info_t *get_functional_kernel() { return m_functional_sim_kernel; } + std::vector get_running_kernels() { + return m_running_kernels; + } void functional_launch(kernel_info_t *k) { m_functional_sim = true; m_functional_sim_kernel = k; diff --git a/src/gpgpu-sim/l2cache.cc b/src/gpgpu-sim/l2cache.cc index 846945378..52eed0ef7 100644 --- a/src/gpgpu-sim/l2cache.cc +++ b/src/gpgpu-sim/l2cache.cc @@ -51,12 +51,12 @@ mem_fetch *partition_mf_allocator::alloc(new_addr_type addr, mem_access_type type, unsigned size, - bool wr, - unsigned long long cycle) const { + bool wr, unsigned long long cycle, + unsigned long long streamID) const { assert(wr); mem_access_t access(type, addr, size, wr, m_memory_config->gpgpu_ctx); - mem_fetch *mf = new mem_fetch(access, NULL, WRITE_PACKET_SIZE, -1, -1, -1, - m_memory_config, cycle); + mem_fetch *mf = new mem_fetch(access, NULL, streamID, WRITE_PACKET_SIZE, -1, + -1, -1, m_memory_config, cycle); return mf; } @@ -65,12 +65,12 @@ mem_fetch *partition_mf_allocator::alloc( const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, - mem_fetch *original_mf) const { + mem_fetch *original_mf, unsigned long long streamID) const { mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, m_memory_config->gpgpu_ctx); - mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, - wid, sid, tpc, m_memory_config, cycle, original_mf); + mem_fetch *mf = new mem_fetch(access, NULL, streamID, + wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, + sid, tpc, m_memory_config, cycle, original_mf); return mf; } memory_partition_unit::memory_partition_unit(unsigned partition_id, @@ -436,9 +436,9 @@ memory_sub_partition::memory_sub_partition(unsigned sub_partition_id, m_mf_allocator = new partition_mf_allocator(config); if (!m_config->m_L2_config.disabled()) - m_L2cache = - new l2_cache(L2c_name, m_config->m_L2_config, -1, -1, m_L2interface, - m_mf_allocator, IN_PARTITION_L2_MISS_QUEUE, gpu); + m_L2cache = new l2_cache(L2c_name, m_config->m_L2_config, -1, -1, + m_L2interface, m_mf_allocator, + IN_PARTITION_L2_MISS_QUEUE, gpu, L2_GPU_CACHE); unsigned int icnt_L2; unsigned int L2_dram; @@ -733,7 +733,7 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { mf->get_access_warp_mask(), mf->get_access_byte_mask() & mask, std::bitset().set(i), SECTOR_SIZE, mf->is_write(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf); + mf->get_sid(), mf->get_tpc(), mf, mf->get_streamID()); result.push_back(n_mf); } @@ -756,7 +756,7 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { mf->get_access_byte_mask() & mask, std::bitset().set(i), SECTOR_SIZE, mf->is_write(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, mf->get_wid(), - mf->get_sid(), mf->get_tpc(), mf); + mf->get_sid(), mf->get_tpc(), mf, mf->get_streamID()); result.push_back(n_mf); } @@ -772,7 +772,8 @@ memory_sub_partition::breakdown_request_to_sector_requests(mem_fetch *mf) { mf->get_access_warp_mask(), mf->get_access_byte_mask() & mask, std::bitset().set(i), SECTOR_SIZE, mf->is_write(), m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf); + mf->get_wid(), mf->get_sid(), mf->get_tpc(), mf, + mf->get_streamID()); result.push_back(n_mf); } diff --git a/src/gpgpu-sim/l2cache.h b/src/gpgpu-sim/l2cache.h index ccf9b70e8..65c9c38b3 100644 --- a/src/gpgpu-sim/l2cache.h +++ 
b/src/gpgpu-sim/l2cache.h @@ -52,15 +52,16 @@ class partition_mf_allocator : public mem_fetch_allocator { return NULL; } virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, - unsigned size, bool wr, - unsigned long long cycle) const; + unsigned size, bool wr, unsigned long long cycle, + unsigned long long streamID) const; virtual mem_fetch *alloc(new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, - mem_fetch *original_mf) const; + mem_fetch *original_mf, + unsigned long long streamID) const; private: const memory_config *m_memory_config; diff --git a/src/gpgpu-sim/mem_fetch.cc b/src/gpgpu-sim/mem_fetch.cc index 0d86046ad..7211a7dd3 100644 --- a/src/gpgpu-sim/mem_fetch.cc +++ b/src/gpgpu-sim/mem_fetch.cc @@ -35,10 +35,10 @@ unsigned mem_fetch::sm_next_mf_request_uid = 1; mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst, - unsigned ctrl_size, unsigned wid, unsigned sid, - unsigned tpc, const memory_config *config, - unsigned long long cycle, mem_fetch *m_original_mf, - mem_fetch *m_original_wr_mf) + unsigned long long streamID, unsigned ctrl_size, + unsigned wid, unsigned sid, unsigned tpc, + const memory_config *config, unsigned long long cycle, + mem_fetch *m_original_mf, mem_fetch *m_original_wr_mf) : m_access(access) { @@ -48,6 +48,7 @@ mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst, m_inst = *inst; assert(wid == m_inst.warp_id()); } + m_streamID = streamID; m_data_size = access.get_size(); m_ctrl_size = ctrl_size; m_sid = sid; diff --git a/src/gpgpu-sim/mem_fetch.h b/src/gpgpu-sim/mem_fetch.h index 283fe80e5..770421822 100644 --- a/src/gpgpu-sim/mem_fetch.h +++ b/src/gpgpu-sim/mem_fetch.h @@ -54,9 +54,10 @@ class memory_config; class mem_fetch { public: mem_fetch(const mem_access_t &access, const 
warp_inst_t *inst, - unsigned ctrl_size, unsigned wid, unsigned sid, unsigned tpc, - const memory_config *config, unsigned long long cycle, - mem_fetch *original_mf = NULL, mem_fetch *original_wr_mf = NULL); + unsigned long long streamID, unsigned ctrl_size, unsigned wid, + unsigned sid, unsigned tpc, const memory_config *config, + unsigned long long cycle, mem_fetch *original_mf = NULL, + mem_fetch *original_wr_mf = NULL); ~mem_fetch(); void set_status(enum mem_fetch_status status, unsigned long long cycle); @@ -105,6 +106,7 @@ class mem_fetch { unsigned get_timestamp() const { return m_timestamp; } unsigned get_return_timestamp() const { return m_timestamp2; } unsigned get_icnt_receive_time() const { return m_icnt_receive_time; } + unsigned long long get_streamID() const { return m_streamID; } enum mem_access_type get_access_type() const { return m_access.get_type(); } const active_mask_t &get_access_warp_mask() const { @@ -163,6 +165,8 @@ class mem_fetch { // requesting instruction (put last so mem_fetch prints nicer in gdb) warp_inst_t m_inst; + unsigned long long m_streamID; + static unsigned sm_next_mf_request_uid; const memory_config *m_mem_config; diff --git a/src/gpgpu-sim/power_stat.cc b/src/gpgpu-sim/power_stat.cc index dead4a0d7..764652b9e 100644 --- a/src/gpgpu-sim/power_stat.cc +++ b/src/gpgpu-sim/power_stat.cc @@ -181,11 +181,11 @@ void power_mem_stat_t::print(FILE *fout) const { total_mem_reads + total_mem_writes); fprintf(fout, "Total memory controller reads: %u\n", total_mem_reads); fprintf(fout, "Total memory controller writes: %u\n", total_mem_writes); - + // TODO: print_stats(require stream ID input) fprintf(fout, "Core cache stats:\n"); - core_cache_stats->print_stats(fout); + core_cache_stats->print_stats(fout, -1); fprintf(fout, "L2 cache stats:\n"); - l2_cache_stats->print_stats(fout); + l2_cache_stats->print_stats(fout, -1); } power_core_stat_t::power_core_stat_t(const shader_core_config *shader_config, diff --git a/src/gpgpu-sim/shader.cc 
b/src/gpgpu-sim/shader.cc index 9fe4c092c..4d4f11277 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -57,11 +57,11 @@ mem_fetch *shader_core_mem_fetch_allocator::alloc( new_addr_type addr, mem_access_type type, unsigned size, bool wr, - unsigned long long cycle) const { + unsigned long long cycle, unsigned long long streamID) const { mem_access_t access(type, addr, size, wr, m_memory_config->gpgpu_ctx); - mem_fetch *mf = - new mem_fetch(access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, - m_core_id, m_cluster_id, m_memory_config, cycle); + mem_fetch *mf = new mem_fetch( + access, NULL, streamID, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, -1, + m_core_id, m_cluster_id, m_memory_config, cycle); return mf; } @@ -70,12 +70,12 @@ mem_fetch *shader_core_mem_fetch_allocator::alloc( const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, unsigned long long cycle, unsigned wid, unsigned sid, unsigned tpc, - mem_fetch *original_mf) const { + mem_fetch *original_mf, unsigned long long streamID) const { mem_access_t access(type, addr, size, wr, active_mask, byte_mask, sector_mask, m_memory_config->gpgpu_ctx); mem_fetch *mf = new mem_fetch( - access, NULL, wr ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, m_core_id, - m_cluster_id, m_memory_config, cycle, original_mf); + access, NULL, streamID, wr ? 
WRITE_PACKET_SIZE : READ_PACKET_SIZE, wid, + m_core_id, m_cluster_id, m_memory_config, cycle, original_mf); return mf; } ///////////////////////////////////////////////////////////////////////////// @@ -178,7 +178,7 @@ void shader_core_ctx::create_front_pipeline() { snprintf(name, STRSIZE, "L1I_%03d", m_sid); m_L1I = new read_only_cache(name, m_config->m_L1I_config, m_sid, get_shader_instruction_cache_id(), m_icnt, - IN_L1I_MISS_QUEUE); + IN_L1I_MISS_QUEUE, OTHER_GPU_CACHE, m_gpu); } void shader_core_ctx::create_schedulers() { @@ -447,7 +447,7 @@ void shader_core_ctx::create_exec_pipeline() { m_ldst_unit = new ldst_unit(m_icnt, m_mem_fetch_allocator, this, &m_operand_collector, m_scoreboard, m_config, - m_memory_config, m_stats, m_sid, m_tpc); + m_memory_config, m_stats, m_sid, m_tpc, m_gpu); m_fu.push_back(m_ldst_unit); m_dispatch_port.push_back(ID_OC_MEM); m_issue_port.push_back(OC_EX_MEM); @@ -567,7 +567,8 @@ void shader_core_ctx::init_warps(unsigned cta_id, unsigned start_thread, start_pc = pc; } - m_warp[i]->init(start_pc, cta_id, i, active_threads, m_dynamic_warp_id); + m_warp[i]->init(start_pc, cta_id, i, active_threads, m_dynamic_warp_id, + kernel.get_streamID()); ++m_dynamic_warp_id; m_not_completed += n_active; ++m_active_warps; @@ -985,8 +986,8 @@ void shader_core_ctx::fetch() { // mem_fetch *mf = m_mem_fetch_allocator->alloc() mem_access_t acc(INST_ACC_R, ppc, nbytes, false, m_gpu->gpgpu_ctx); mem_fetch *mf = new mem_fetch( - acc, NULL /*we don't have an instruction yet*/, READ_PACKET_SIZE, - warp_id, m_sid, m_tpc, m_memory_config, + acc, NULL, m_warp[warp_id]->get_kernel_info()->get_streamID(), + READ_PACKET_SIZE, warp_id, m_sid, m_tpc, m_memory_config, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle); std::list events; enum cache_request_status status; @@ -1040,10 +1041,10 @@ void shader_core_ctx::issue_warp(register_set &pipe_reg_set, m_warp[warp_id]->ibuffer_free(); assert(next_inst->valid()); **pipe_reg = *next_inst; // static instruction 
information - (*pipe_reg)->issue(active_mask, warp_id, - m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, - m_warp[warp_id]->get_dynamic_warp_id(), - sch_id); // dynamic instruction information + (*pipe_reg)->issue( + active_mask, warp_id, m_gpu->gpu_tot_sim_cycle + m_gpu->gpu_sim_cycle, + m_warp[warp_id]->get_dynamic_warp_id(), sch_id, + m_warp[warp_id]->get_streamID()); // dynamic instruction information m_stats->shader_cycle_distro[2 + (*pipe_reg)->active_count()]++; func_exec_inst(**pipe_reg); @@ -2597,7 +2598,7 @@ void ldst_unit::init(mem_fetch_interface *icnt, IN_SHADER_L1T_ROB); m_L1C = new read_only_cache(L1C_name, m_config->m_L1C_config, m_sid, get_shader_constant_cache_id(), icnt, - IN_L1C_MISS_QUEUE); + IN_L1C_MISS_QUEUE, OTHER_GPU_CACHE, m_gpu); m_L1D = NULL; m_mem_rc = NO_RC_FAIL; m_num_writeback_clients = @@ -2613,9 +2614,10 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, shader_core_stats *stats, - unsigned sid, unsigned tpc) + unsigned sid, unsigned tpc, gpgpu_sim *gpu) : pipelined_simd_unit(NULL, config, config->smem_latency, core, 0), - m_next_wb(config) { + m_next_wb(config), + m_gpu(gpu) { assert(config->smem_latency > 1); init(icnt, mf_allocator, core, operand_collector, scoreboard, config, mem_config, stats, sid, tpc); @@ -2624,7 +2626,7 @@ ldst_unit::ldst_unit(mem_fetch_interface *icnt, snprintf(L1D_name, STRSIZE, "L1D_%03d", m_sid); m_L1D = new l1_cache(L1D_name, m_config->m_L1D_config, m_sid, get_shader_normal_cache_id(), m_icnt, m_mf_allocator, - IN_L1D_MISS_QUEUE, core->get_gpu()); + IN_L1D_MISS_QUEUE, core->get_gpu(), L1_GPU_CACHE); l1_latency_queue.resize(m_config->m_L1D_config.l1_banks); assert(m_config->m_L1D_config.l1_latency > 0); diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index 92691d386..e658a14c9 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ 
-120,6 +120,7 @@ class shd_warp_t { m_done_exit = true; m_last_fetch = 0; m_next = 0; + m_streamID = (unsigned long long)-1; // Jin: cdp support m_cdp_latency = 0; @@ -140,8 +141,9 @@ class shd_warp_t { m_ldgdepbar_buf.clear(); } void init(address_type start_pc, unsigned cta_id, unsigned wid, - const std::bitset &active, - unsigned dynamic_warp_id) { + const std::bitset &active, unsigned dynamic_warp_id, + unsigned long long streamID) { + m_streamID = streamID; m_cta_id = cta_id; m_warp_id = wid; m_dynamic_warp_id = dynamic_warp_id; @@ -265,6 +267,7 @@ class shd_warp_t { m_inst_in_pipeline--; } + unsigned long long get_streamID() const { return m_streamID; } unsigned get_cta_id() const { return m_cta_id; } unsigned get_dynamic_warp_id() const { return m_dynamic_warp_id; } @@ -277,6 +280,7 @@ class shd_warp_t { private: static const unsigned IBUFFER_SIZE = 2; class shader_core_ctx *m_shader; + unsigned long long m_streamID; unsigned m_cta_id; unsigned m_warp_id; unsigned m_warp_size; @@ -1345,7 +1349,7 @@ class ldst_unit : public pipelined_simd_unit { shader_core_ctx *core, opndcoll_rfu_t *operand_collector, Scoreboard *scoreboard, const shader_core_config *config, const memory_config *mem_config, class shader_core_stats *stats, - unsigned sid, unsigned tpc); + unsigned sid, unsigned tpc, gpgpu_sim *gpu); // Add a structure to record the LDGSTS instructions, // similar to m_pending_writes, but since LDGSTS does not have a output @@ -1435,6 +1439,7 @@ class ldst_unit : public pipelined_simd_unit { warp_inst_t &inst); mem_stage_stall_type process_memory_access_queue_l1cache(l1_cache *cache, warp_inst_t &inst); + gpgpu_sim *m_gpu; const memory_config *m_memory_config; class mem_fetch_interface *m_icnt; @@ -2025,18 +2030,20 @@ class shader_core_mem_fetch_allocator : public mem_fetch_allocator { m_memory_config = config; } mem_fetch *alloc(new_addr_type addr, mem_access_type type, unsigned size, - bool wr, unsigned long long cycle) const; + bool wr, unsigned long long 
cycle, + unsigned long long streamID) const; mem_fetch *alloc(new_addr_type addr, mem_access_type type, const active_mask_t &active_mask, const mem_access_byte_mask_t &byte_mask, const mem_access_sector_mask_t §or_mask, unsigned size, bool wr, unsigned long long cycle, unsigned wid, - unsigned sid, unsigned tpc, mem_fetch *original_mf) const; + unsigned sid, unsigned tpc, mem_fetch *original_mf, + unsigned long long streamID) const; mem_fetch *alloc(const warp_inst_t &inst, const mem_access_t &access, unsigned long long cycle) const { warp_inst_t inst_copy = inst; mem_fetch *mf = new mem_fetch( - access, &inst_copy, + access, &inst_copy, inst.get_streamID(), access.is_write() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE, inst.warp_id(), m_core_id, m_cluster_id, m_memory_config, cycle); return mf; diff --git a/src/gpgpusim_entrypoint.cc b/src/gpgpusim_entrypoint.cc index f4287d8a7..42c6981b0 100644 --- a/src/gpgpusim_entrypoint.cc +++ b/src/gpgpusim_entrypoint.cc @@ -57,7 +57,8 @@ void *gpgpu_sim_thread_sequential(void *ctx_ptr) { ctx->the_gpgpusim->g_the_gpu->cycle(); ctx->the_gpgpusim->g_the_gpu->deadlock_check(); } - ctx->the_gpgpusim->g_the_gpu->print_stats(); + ctx->the_gpgpusim->g_the_gpu->print_stats( + ctx->the_gpgpusim->g_the_gpu->last_streamID); ctx->the_gpgpusim->g_the_gpu->update_stats(); ctx->print_simulation_time(); } @@ -144,7 +145,8 @@ void *gpgpu_sim_thread_concurrent(void *ctx_ptr) { fflush(stdout); } if (sim_cycles) { - ctx->the_gpgpusim->g_the_gpu->print_stats(); + ctx->the_gpgpusim->g_the_gpu->print_stats( + ctx->the_gpgpusim->g_the_gpu->last_streamID); ctx->the_gpgpusim->g_the_gpu->update_stats(); ctx->print_simulation_time(); } diff --git a/src/stream_manager.cc b/src/stream_manager.cc index 0ce3c6a74..72f8bb0b2 100644 --- a/src/stream_manager.cc +++ b/src/stream_manager.cc @@ -302,6 +302,14 @@ bool stream_manager::register_finished_kernel(unsigned grid_uid) { void stream_manager::stop_all_running_kernels() { pthread_mutex_lock(&m_lock); + 
std::vector finished_streams; + std::vector running_kernels = m_gpu->get_running_kernels(); + for (kernel_info_t *k : running_kernels) { + if (k != NULL) { + finished_streams.push_back(k->get_streamID()); + } + } + // Signal m_gpu to stop all running kernels m_gpu->stop_all_running_kernels(); @@ -312,7 +320,9 @@ void stream_manager::stop_all_running_kernels() { } // If any kernels completed, print out the current stats - if (count > 0) m_gpu->print_stats(); + for (unsigned long long streamID : finished_streams) { + m_gpu->print_stats(streamID); + } pthread_mutex_unlock(&m_lock); } From 980eb88b547dad53b2343cc90ed66c4dec48dea3 Mon Sep 17 00:00:00 2001 From: Christin David Bose Date: Tue, 1 Oct 2024 14:09:31 -0700 Subject: [PATCH 149/154] Change to calculate L2 BW if core freq and icnt freq are not the same (#78) --- src/gpgpu-sim/gpu-sim.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 0c922bdb3..5bd41805d 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -1445,13 +1445,13 @@ void gpgpu_sim::gpu_print_stat(unsigned long long streamID) { // %lld\n", partiton_replys_in_parallel_total ); printf("L2_BW = %12.4f GB/Sec\n", ((float)(partiton_replys_in_parallel * 32) / - (gpu_sim_cycle * m_config.icnt_period)) / + (gpu_sim_cycle * m_config.core_period)) / 1000000000); printf("L2_BW_total = %12.4f GB/Sec\n", ((float)((partiton_replys_in_parallel + partiton_replys_in_parallel_total) * 32) / - ((gpu_tot_sim_cycle + gpu_sim_cycle) * m_config.icnt_period)) / + ((gpu_tot_sim_cycle + gpu_sim_cycle) * m_config.core_period)) / 1000000000); time_t curr_time; From 667834cfe5214523edd7769aeab77f91b7137686 Mon Sep 17 00:00:00 2001 From: JRPan <25518778+JRPan@users.noreply.github.com> Date: Mon, 7 Oct 2024 13:56:32 -0700 Subject: [PATCH 150/154] we have gcc-11 now. Check version for more than 2 digits. (#79) * we have gcc-11 now. Check version for more than 2 digits. 
* version detection as well - And support c++ 11 by default --- setup_environment | 3 ++- version_detection.mk | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/setup_environment b/setup_environment index 871bb593a..342810151 100644 --- a/setup_environment +++ b/setup_environment @@ -44,7 +44,8 @@ if [ $? = 1 ]; then return 1; fi -CC_VERSION=`gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($i,/^[0-9]\.[0-9]\.[0-9]$/)) {print $i; exit 0}}}'` +CC_VERSION=$(gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($i,/^[0-9]+\.[0-9]+\.[0-9]+$/)) {print $i; exit 0}}}') + CUDA_VERSION_STRING=`$CUDA_INSTALL_PATH/bin/nvcc --version | awk '/release/ {print $5;}' | sed 's/,//'`; export CUDA_VERSION_NUMBER=`echo $CUDA_VERSION_STRING | sed 's/\./ /' | awk '{printf("%02u%02u", 10*int($1), 10*$2);}'` diff --git a/version_detection.mk b/version_detection.mk index 81c1d2ae7..0bf309500 100644 --- a/version_detection.mk +++ b/version_detection.mk @@ -43,7 +43,7 @@ CUDA_VERSION_STRING:=$(shell $(CUDA_INSTALL_PATH)/bin/nvcc --version | awk '/rel CUDART_VERSION:=$(shell echo $(CUDA_VERSION_STRING) | sed 's/\./ /' | awk '{printf("%02u%02u", 10*int($$1), 10*$$2);}') # Detect GCC Version -CC_VERSION := $(shell gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($$i,/^[0-9]\.[0-9]\.[0-9]$$/)) {print $$i; exit 0 }}}') +CC_VERSION := $(shell gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($$i,/^[0-9]+\.[0-9]+\.[0-9]+$$/)) {print $$i; exit 0 }}}') # Detect Support for C++11 (C++0x) from GCC Version -GNUC_CPP0X := $(shell gcc --version | perl -ne 'if (/gcc\s+\(.*\)\s+([0-9.]+)/){ if($$1 >= 4.3) {$$n=1} else {$$n=0;} } END { print $$n; }') +GNUC_CPP0X := 1 From 752d4e5bf622b0d7c730e3eb2f1b3e3cf91e81fa Mon Sep 17 00:00:00 2001 From: WilliamMTK Date: Wed, 11 Dec 2024 16:52:40 -0500 Subject: [PATCH 151/154] Add SST integration into gpgpusim (#44) * Add accommodations to run gpgpusim with SST simulation framework through balar * Output 
setup_environment options when sourcing * Add SST directive check when creating sim thread * Add sst side test for jenkins * sst-integration: update Jenkinsfile with offical sst-elements repo and fix bugs in pipeline script * sst-integration: direct jenkins to rebuild gpgpusim before testing for sst * sst-integration: fix bugs in sst repos config * sst-integration: let Jenkins rebuilds simulator Since the simulator needs to be configured with both normal mode and sst mode, need to rebuild make target to clean prior runs. * sst-integration: Update Jenkinsfile to source env vars when running balar test * sst-integration: refactor code to remove __SST__ flag * sst-integration: fix a bug that init cluster twice for sst * sst-integration: fix a bug of not sending mem packets to SST * sst-integration: remove sst flags from makefiles and setup_env * sst-integration: add comments to SST changes * sst-integration: remove rebuilding simulator in jenkins when testing for SST * sst-integration: revert simulator build script * Add a function to support querying function argument info for SST * sst-integration: add version detection for vanadis binary * Automated Format * add version detection support for gcc 10+ * sst-integration: add cudaMallocHost for SST * sst-integration: fix a compilation bug * sst-integration: add sst balar unittest CI * sst-integration: specify GPU_ARCH for CI test * sst-integration: use bash for github actions * sst-integration: use https links for sst repos * sst-integration: add SST dependencies to CI config * sst-integration: remove sudo * sst-integration: default to yes for apt install * sst-integration: add manual trigger for github action * sst-integration: remove wrong on event * sst-integration: limit CPU usage for compilation * sst-integration: fix wrong path * sst-integration: use personal repo for testing * sst-integration: remove sst-core source in CI to free space * sst-integration: SST_Cycle use print stats with stream id * Automated 
Format * sst-integration: check for diskspace and try to clean it * sst-integration: move out of docker image * sst-integration: testing for ci path * sst-integration: fix syntax * sst-integration: pass env vars * sst-integration: set env properly * sst-integration: merge LLVM build and test into same job * sst-integration: fix step order * sst-integration: checkout correct branch for env-setup * sst-integration: remove resourcing gpu apps * sst-integration: revert back to docker github action * sst-integration: enable debug trace for sst testing * sst-integration: resourcing gpu app for env vars * sst-integration: use GPUAPPS_ROOT for path for gpu app * sst-integration: use GPUAPPS_ROOT for path for gpu app * sst-integration: enable parallel ci tests and fix not returning with cudaMallocHostSST * sst-integration: using debug flag for CI run * sst-integration: revert debug ci run * sst-integration: CI skips cuda sdk download and launch multiple jobs * sst-integration: reenable parallel tests * sst-integration: reduce concurrent test thread count * sst-integration: skip long test for github runner * sst-integration: try running CI with single core * sst-integrtion: add callback to SST to check thread sync is done in SST_Cycle() * sst-integration: ignore lookup if already found and add callbacks to SST * Automated Format * sst-integration: add support for indirect texture access * Automated Format * sste-integration: fix up for PR * Automated Format --------- Co-authored-by: purdue-jenkins --- .github/workflows/sst_integration.yml | 80 ++++++++ Jenkinsfile | 35 ++++ Makefile | 2 + libcuda/cuda_api_object.h | 17 ++ libcuda/cuda_runtime_api.cc | 274 +++++++++++++++++++++++--- libcuda/gpgpu_context.h | 4 + setup_environment | 18 +- src/cuda-sim/cuda-sim.cc | 45 ++++- src/cuda-sim/instructions.cc | 11 ++ src/cuda-sim/ptx_ir.cc | 16 ++ src/cuda-sim/ptx_ir.h | 2 + src/cuda-sim/ptx_sim.h | 3 + src/gpgpu-sim/gpu-cache.cc | 1 + src/gpgpu-sim/gpu-sim.cc | 200 
+++++++++++++++++-- src/gpgpu-sim/gpu-sim.h | 139 ++++++++++++- src/gpgpu-sim/mem_fetch.cc | 12 +- src/gpgpu-sim/mem_latency_stat.cc | 17 +- src/gpgpu-sim/shader.cc | 135 +++++++++++-- src/gpgpu-sim/shader.h | 101 ++++++++++ src/gpgpusim_entrypoint.cc | 138 ++++++++++++- src/stream_manager.cc | 12 +- 21 files changed, 1176 insertions(+), 86 deletions(-) create mode 100644 .github/workflows/sst_integration.yml diff --git a/.github/workflows/sst_integration.yml b/.github/workflows/sst_integration.yml new file mode 100644 index 000000000..03635db64 --- /dev/null +++ b/.github/workflows/sst_integration.yml @@ -0,0 +1,80 @@ +# Workflow with cmake build system +name: SST Integration Test + +# Controls when the workflow will run +on: + # Triggers the workflow on push or pull request events but only for the mydev branch + push: + branches-ignore: + - "gh-readonly-queue**" + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + build-QV100: + runs-on: ubuntu-latest + defaults: + run: + shell: bash + strategy: + matrix: + # test_type: [simple, medium, long] + test_type: [simple, medium] + container: + image: tgrogers/accel-sim_regress:SST-Integration-Ubuntu-22.04-cuda-11.7-llvm-18.1.8-riscv-gnu-2024.08.06-nightly + env: + CONFIG: QV100 + GPU_ARCH: sm_70 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Build GPGPU-Sim in SST mode + run: | + source ./setup_environment + make -j4 + - name: Prepare SST dependencies + run: | + apt install -y openmpi-bin openmpi-common libtool libtool-bin autoconf python3 python3-dev automake build-essential git + # Use personal repo for now + - name: Build SST-Core + run: | + git clone https://github.com/William-An/sst-core.git + cd 
sst-core + git pull + git checkout devel + ./autogen.sh + ./configure --prefix=`realpath ../sstcore-install` --disable-mpi --disable-mem-pools + make -j4 + make install + cd .. + rm -rf ./sst-core + # Use personal repo for now + - name: Build SST-Elements + run: | + git clone https://github.com/William-An/sst-elements.git + source ./setup_environment + cd sst-elements + git pull + git checkout balar-mmio-vanadis-llvm + ./autogen.sh + ./configure --prefix=`realpath ../sstelements-install` --with-sst-core=`realpath ../sstcore-install` --with-cuda=$CUDA_INSTALL_PATH --with-gpgpusim=$GPGPUSIM_ROOT + make -j4 + make install + # Have to resource the gpu app + # Also fake a SDK since rodinia 2.0 does not need this, speed things up on github + - name: Balar Test + run: | + pip install testtools blessings pygments + source ./setup_environment + mkdir 4.2 + mkdir fake_sdk + export NVIDIA_COMPUTE_SDK_LOCATION=$(readlink -f ./fake_sdk) + source $GPUAPPS_ROOT/src/setup_environment sst + rm -rf 4.2 + rm -f gpucomputingsdk_4.2.9_linux.run + ./sstcore-install/bin/sst-test-elements -w "*balar*${{ matrix.test_type }}*" \ No newline at end of file diff --git a/Jenkinsfile b/Jenkinsfile index f6676bf14..4ef467bae 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -87,6 +87,41 @@ pipeline { ssh tgrogers@dynamo.ecn.purdue.edu "cd $PLOTDIR && rm -rf latest && cp -r ${BUILD_NUMBER} latest"' } } + stage('sst-core-build') { + steps { + sh 'rm -rf sstcore-install' + sh 'rm -rf sst-core && git clone git@github.com:sstsimulator/sst-core.git' + sh '''#!/bin/bash + cd sst-core + ./autogen.sh + ./configure --prefix=`realpath ../sstcore-install` --disable-mpi --disable-mem-pools + make -j 10 + make install''' + } + } + stage('sst-elements-build') { + steps { + sh 'rm -rf sstelements-install' + sh 'rm -rf sst-elements && git clone git@github.com:sstsimulator/sst-elements.git' + // First sourcing the env_setup and setup_environment script for env vars + sh '''#!/bin/bash + source 
./env-setup/11.0_env_setup.sh + source `pwd`/setup_environment + cd sst-elements + ./autogen.sh + ./configure --prefix=`realpath ../sstelements-install` --with-sst-core=`realpath ../sstcore-install` --with-cuda=$CUDA_INSTALL_PATH --with-gpgpusim=$GPGPUSIM_ROOT + make -j 10 + make install''' + } + } + stage('sst balar test') { + steps { + sh '''#!/bin/bash + source ./env-setup/11.0_env_setup.sh + source `pwd`/setup_environment sst + ./sstcore-install/bin/sst-test-elements -p ./sst-elements/src/sst/elements/balar/tests''' + } + } } post { success { diff --git a/Makefile b/Makefile index 82ea39928..37dba0146 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ INTERSIM ?= intersim2 include version_detection.mk +# Check for debug ifeq ($(GPGPUSIM_CONFIG), gcc-$(CC_VERSION)/cuda-$(CUDART_VERSION)/debug) export DEBUG=1 else @@ -168,6 +169,7 @@ $(SIM_LIB_DIR)/libcudart.so: makedirs $(LIBS) cudalib if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.0; fi if [ ! -f $(SIM_LIB_DIR)/libcudart.so.10.1 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.10.1; fi if [ ! -f $(SIM_LIB_DIR)/libcudart.so.11.0 ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart.so.11.0; fi + if [ ! 
-f $(SIM_LIB_DIR)/libcudart_mod.so ]; then ln -s libcudart.so $(SIM_LIB_DIR)/libcudart_mod.so; fi $(SIM_LIB_DIR)/libcudart.dylib: makedirs $(LIBS) cudalib g++ -dynamiclib -Wl,-headerpad_max_install_names,-undefined,dynamic_lookup,-compatibility_version,1.1,-current_version,1.1\ diff --git a/libcuda/cuda_api_object.h b/libcuda/cuda_api_object.h index d292e224e..e620e5728 100644 --- a/libcuda/cuda_api_object.h +++ b/libcuda/cuda_api_object.h @@ -1,6 +1,7 @@ #ifndef __cuda_api_object_h__ #define __cuda_api_object_h__ +#include #include #include #include @@ -193,9 +194,25 @@ class cuda_runtime_api { // backward pointer class gpgpu_context *gpgpu_ctx; // member function list + + // For SST and other potential simulator interface + void cuobjdumpInit(const char *fn); + void extract_code_using_cuobjdump(const char *fn); + void extract_ptx_files_using_cuobjdump(CUctx_st *context, const char *fn); + + // For running GPGPUSim alone void cuobjdumpInit(); void extract_code_using_cuobjdump(); void extract_ptx_files_using_cuobjdump(CUctx_st *context); + + // Internal functions for the above public methods + void cuobjdumpInit_internal(std::function ctx_extract_code_func); + void extract_code_using_cuobjdump_internal( + CUctx_st *context, std::string &app_binary, + std::function ctx_extract_ptx_func); + void extract_ptx_files_using_cuobjdump_internal(CUctx_st *context, + std::string &app_binary); + std::list pruneSectionList(CUctx_st *context); std::list mergeMatchingSections(std::string identifier); std::list mergeSections(); diff --git a/libcuda/cuda_runtime_api.cc b/libcuda/cuda_runtime_api.cc index b540ffd91..5dfd3fc38 100644 --- a/libcuda/cuda_runtime_api.cc +++ b/libcuda/cuda_runtime_api.cc @@ -109,6 +109,7 @@ #include #include #include +#include #include #include #include @@ -151,6 +152,9 @@ #include #endif +// SST cycle +extern bool SST_Cycle(); + /*DEVICE_BUILTIN*/ struct cudaArray { void *devPtr; @@ -412,6 +416,13 @@ void setCuobjdumpsassfilename( //! processes (e.g. 
cuobjdump) reading /proc//exe will see the emulator //! executable instead of the application binary. //! +// In SST need the string to pass the binary information +// as we cannot get it from /proc/self/exe +std::string get_app_binary(const char *fn) { + printf("self exe links to: %s\n", fn); + return fn; +} + std::string get_app_binary() { char self_exe_path[1025]; #ifdef __APPLE__ @@ -453,19 +464,25 @@ char *get_app_binary_name(std::string abs_path) { return self_exe_path; } -static int get_app_cuda_version() { +static int get_app_cuda_version_internal(std::string app_binary) { int app_cuda_version = 0; char fname[1024]; snprintf(fname, 1024, "_app_cuda_version_XXXXXX"); int fd = mkstemp(fname); close(fd); + // Weili: Add way to extract CUDA version information from Balar Vanadis + // binary (stored as a const string) std::string app_cuda_version_command = - "ldd " + get_app_binary() + + "ldd " + app_binary + " | grep libcudart.so | sed 's/.*libcudart.so.\\(.*\\) =>.*/\\1/' > " + + fname + " && strings " + app_binary + + " | grep libcudart_vanadis.a | sed " + "'s/.*libcudart_vanadis.a.\\(.*\\)/\\1/' >> " + fname; int res = system(app_cuda_version_command.c_str()); if (res == -1) { - printf("Error - Cannot detect the app's CUDA version.\n"); + printf("Error - Cannot detect the app's CUDA version. Command: %s\n", + app_cuda_version_command.c_str()); exit(1); } FILE *cmd = fopen(fname, "r"); @@ -476,12 +493,24 @@ static int get_app_cuda_version() { } fclose(cmd); if (app_cuda_version == 0) { - printf("Error - Cannot detect the app's CUDA version.\n"); + printf("Error - Cannot detect the app's CUDA version. 
Command: %s\n", + app_cuda_version_command.c_str()); exit(1); } return app_cuda_version; } +static int get_app_cuda_version(const char *fn) { + // Use for other simulator integration + std::string app_binary = get_app_binary(fn); + return get_app_cuda_version_internal(app_binary); +} + +static int get_app_cuda_version() { + std::string app_binary = get_app_binary(); + return get_app_cuda_version_internal(app_binary); +} + //! Keep track of the association between filename and cubin handle void cuda_runtime_api::cuobjdumpRegisterFatBinary(unsigned int handle, const char *filename, @@ -574,8 +603,11 @@ __host__ cudaError_t CUDARTAPI cudaDeviceGetLimitInternal( return g_last_cudaError = cudaSuccess; } -void **cudaRegisterFatBinaryInternal(void *fatCubin, - gpgpu_context *gpgpu_ctx = NULL) { +// Internal implementation for cudaRegisterFatBiaryInternal +void **cudaRegisterFatBiaryInternal_impl( + void *fatCubin, gpgpu_context *gpgpu_ctx, std::string &app_binary_path, + int app_cuda_version, + std::function ctx_cuobjdumpInit_func) { gpgpu_context *ctx; if (gpgpu_ctx) { ctx = gpgpu_ctx; @@ -606,11 +638,9 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, // compiled with a newer version of CUDA to run apps compiled with older // versions of CUDA. This is especially useful for PTXPLUS execution. 
// Skip cuda version check for pytorch application - std::string app_binary_path = get_app_binary(); int pos = app_binary_path.find("python"); if (pos == std::string::npos) { // Not pytorch app : checking cuda version - int app_cuda_version = get_app_cuda_version(); assert( app_cuda_version == CUDART_VERSION / 1000 && "The app must be compiled with same major version as the simulator."); @@ -661,7 +691,7 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, * then for next calls, only returns the appropriate number */ assert(fat_cubin_handle >= 1); - if (fat_cubin_handle == 1) ctx->api->cuobjdumpInit(); + if (fat_cubin_handle == 1) ctx_cuobjdumpInit_func(ctx); ctx->api->cuobjdumpRegisterFatBinary(fat_cubin_handle, filename, context); return (void **)fat_cubin_handle; @@ -753,6 +783,28 @@ void **cudaRegisterFatBinaryInternal(void *fatCubin, #endif } +void **cudaRegisterFatBinaryInternal(const char *fn, void *fatCubin, + gpgpu_context *gpgpu_ctx = NULL) { + std::string app_binary_path = get_app_binary(fn); + int app_cuda_version = get_app_cuda_version(fn); + auto ctx_cuobjdumpInit = [=](gpgpu_context *ctx) { + ctx->api->cuobjdumpInit(fn); + }; + return cudaRegisterFatBiaryInternal_impl(fatCubin, gpgpu_ctx, app_binary_path, + app_cuda_version, ctx_cuobjdumpInit); +} + +void **cudaRegisterFatBinaryInternal(void *fatCubin, + gpgpu_context *gpgpu_ctx = NULL) { + std::string app_binary_path = get_app_binary(); + int app_cuda_version = get_app_cuda_version(); + auto ctx_cuobjdumpInit = [](gpgpu_context *ctx) { + ctx->api->cuobjdumpInit(); + }; + return cudaRegisterFatBiaryInternal_impl(fatCubin, gpgpu_ctx, app_binary_path, + app_cuda_version, ctx_cuobjdumpInit); +} + void cudaRegisterFunctionInternal(void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, uint3 *tid, uint3 *bid, @@ -1057,6 +1109,24 @@ cudaError_t cudaMallocHostInternal(void **ptr, size_t size, } } +// SST malloc done by vanadis, we just need to record the 
memory addr +cudaError_t CUDARTAPI cudaMallocHostSSTInternal( + void *addr, size_t size, gpgpu_context *gpgpu_ctx = NULL) { + gpgpu_context *ctx; + if (gpgpu_ctx) { + ctx = gpgpu_ctx; + } else { + ctx = GPGPU_Context(); + } + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + // track pinned memory size allocated in the host so that same amount of + // memory is also allocated in GPU. + ctx->api->pinned_memory_size[addr] = size; + return g_last_cudaError = cudaSuccess; +} + __host__ cudaError_t CUDARTAPI cudaMallocPitchInternal(void **devPtr, size_t *pitch, size_t width, size_t height, gpgpu_context *gpgpu_ctx = NULL) { @@ -2301,13 +2371,77 @@ cudaDeviceSynchronizeInternal(gpgpu_context *gpgpu_ctx = NULL) { * * *******************************************************************************/ -extern "C" { - /******************************************************************************* * * - * * + * SST Specific functions, used by Balar * * * *******************************************************************************/ + +/** + * @brief Custom function to get CUDA function parameter size and offset + * from PTX parsing result + * + * @param hostFun + * @param index + * @return std::tuple + */ +std::tuple SST_cudaGetParamConfig( + uint64_t hostFun, unsigned index) { + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + function_info *entry = context->get_kernel((char *)hostFun); + cudaError_t result = cudaSuccess; + size_t size = 0; + unsigned alignment = 0; + if (index >= entry->num_args()) { + result = cudaErrorAssert; + } else { + std::pair p = entry->get_param_config(index); + size = p.first; + alignment = p.second; + } + return std::tuple(result, size, alignment); +} + +extern "C" { +void SST_receive_mem_reply(unsigned core_id, void *mem_req) { + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + static_cast(context->get_device()->get_gpgpu()) + 
->SST_receive_mem_reply(core_id, mem_req); + // printf("GPGPU-sim: Recived Request\n"); +} + +bool SST_gpu_core_cycle() { return SST_Cycle(); } + +void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) { + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + static_cast(context->get_device()->get_gpgpu()) + ->SST_gpgpusim_numcores_equal_check(sst_numcores); +} + +uint64_t cudaMallocSST(void **devPtr, size_t size) { + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + void *test_malloc; + test_malloc = (void *)malloc(size); + void **test_malloc2 = &test_malloc; + CUctx_st *context = GPGPUSim_Context(GPGPU_Context()); + *test_malloc2 = context->get_device()->get_gpgpu()->gpu_malloc(size); + printf("GPGPU-Sim PTX: cudaMallocing %zu bytes starting at 0x%llx..\n", size, + (unsigned long long)*test_malloc2); + if (g_debug_execution >= 3) + printf("GPGPU-Sim PTX: cudaMallocing %zu bytes starting at 0x%llx..\n", + size, (unsigned long long)*test_malloc2); + return (uint64_t)*test_malloc2; +} + +__host__ cudaError_t CUDARTAPI cudaMallocHostSST(void *addr, size_t size) { + return cudaMallocHostSSTInternal(addr, size); +} + cudaError_t cudaPeekAtLastError(void) { return g_last_cudaError; } __host__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) { @@ -2534,6 +2668,7 @@ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( * * * * *******************************************************************************/ + __host__ cudaError_t CUDARTAPI cudaMemset(void *mem, int c, size_t count) { return cudaMemsetInternal(mem, c, count); } @@ -2754,11 +2889,32 @@ __host__ const char *CUDARTAPI cudaGetErrorString(cudaError_t error) { return strdup(buf); } +// SST specific cuda apis +__host__ cudaError_t CUDARTAPI cudaSetupArgumentSST(uint64_t arg, + uint8_t value[200], + size_t size, + size_t offset) { + void *local_value; + local_value = (void *)malloc(size); + + if (arg) { + memcpy(local_value, (void *)&arg, size); + } else 
{ + memcpy(local_value, value, size); + } + return cudaSetupArgumentInternal(local_value, size, offset); +} + __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset) { return cudaSetupArgumentInternal(arg, size, offset); } +// SST specific cuda apis +__host__ cudaError_t CUDARTAPI cudaLaunchSST(uint64_t hostFun) { + return cudaLaunchInternal((char *)hostFun); +} + __host__ cudaError_t CUDARTAPI cudaLaunch(const char *hostFun) { return cudaLaunchInternal(hostFun); } @@ -2933,6 +3089,27 @@ __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) { return cudaThreadSynchronizeInternal(); } +__host__ cudaError_t CUDARTAPI cudaThreadSynchronizeSST(void) { + // For SST, perform a one-time check and let SST_Cycle() + // do the polling test and invoke callback to SST + // to signal ThreadSynchonize done + gpgpu_context *ctx = GPGPU_Context(); + if (g_debug_execution >= 3) { + announce_call(__my_func__); + } + + // Called on host side + bool thread_sync_done = ctx->synchronize_check(); + g_last_cudaError = cudaSuccess; + if (thread_sync_done) { + // We are already done, so no need to poll for sync done + ctx->requested_synchronize = false; + return cudaSuccess; + } else { + return cudaErrorNotReady; + } +} + int CUDARTAPI __cudaSynchronizeThreads(void **, void *) { if (g_debug_execution >= 3) { announce_call(__my_func__); @@ -2992,10 +3169,10 @@ __host__ cudaError_t CUDARTAPI cudaGetExportTable( // extracts all ptx files from binary and dumps into // prog_name.unique_no.sm_<>.ptx files -void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { +void cuda_runtime_api::extract_ptx_files_using_cuobjdump_internal( + CUctx_st *context, std::string &app_binary) { char command[1000]; char *pytorch_bin = getenv("PYTORCH_BIN"); - std::string app_binary = get_app_binary(); char ptx_list_file_name[1024]; snprintf(ptx_list_file_name, 1024, "_cuobjdump_list_ptx_XXXXXX"); @@ -3062,6 +3239,17 @@ void 
cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { } } +void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context, + const char *fn) { + std::string app_binary = get_app_binary(fn); + this->extract_ptx_files_using_cuobjdump_internal(context, app_binary); +} + +void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { + std::string app_binary = get_app_binary(); + this->extract_ptx_files_using_cuobjdump_internal(context, app_binary); +} + //! Call cuobjdump to extract everything (-elf -sass -ptx) /*! * This Function extract the whole PTX (for all the files) using cuobjdump @@ -3069,13 +3257,12 @@ void cuda_runtime_api::extract_ptx_files_using_cuobjdump(CUctx_st *context) { *with each binary in its own file It is also responsible for extracting the *libraries linked to the binary if the option is enabled * */ -void cuda_runtime_api::extract_code_using_cuobjdump() { - CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); - +void cuda_runtime_api::extract_code_using_cuobjdump_internal( + CUctx_st *context, std::string &app_binary, + std::function ctx_extract_ptx_func) { // prevent the dumping by cuobjdump everytime we execute the code! const char *override_cuobjdump = getenv("CUOBJDUMP_SIM_FILE"); char command[1000]; - std::string app_binary = get_app_binary(); // Running cuobjdump using dynamic link to current process snprintf(command, 1000, "md5sum %s ", app_binary.c_str()); printf("Running md5sum using \"%s\"\n", command); @@ -3090,7 +3277,7 @@ void cuda_runtime_api::extract_code_using_cuobjdump() { // used by ptxas. int result = 0; #if (CUDART_VERSION >= 6000) - extract_ptx_files_using_cuobjdump(context); + ctx_extract_ptx_func(context); return; #endif // TODO: redundant to dump twice. how can it be prevented? 
@@ -3222,6 +3409,26 @@ void cuda_runtime_api::extract_code_using_cuobjdump() { } } +void cuda_runtime_api::extract_code_using_cuobjdump(const char *fn) { + CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); + std::string app_binary = get_app_binary(fn); + auto ctx_extract_ptx_func = [=](CUctx_st *context) { + extract_ptx_files_using_cuobjdump(context, fn); + }; + extract_code_using_cuobjdump_internal(context, app_binary, + ctx_extract_ptx_func); +} + +void cuda_runtime_api::extract_code_using_cuobjdump() { + CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); + std::string app_binary = get_app_binary(); + auto ctx_extract_ptx_func = [=](CUctx_st *context) { + extract_ptx_files_using_cuobjdump(context); + }; + extract_code_using_cuobjdump_internal(context, app_binary, + ctx_extract_ptx_func); +} + //! Read file into char* // TODO: convert this to C++ streams, will be way cleaner char *readfile(const std::string filename) { @@ -3466,10 +3673,11 @@ cuobjdumpPTXSection *cuda_runtime_api::findPTXSection( } //! 
Extract the code using cuobjdump and remove unnecessary sections -void cuda_runtime_api::cuobjdumpInit() { +void cuda_runtime_api::cuobjdumpInit_internal( + std::function ctx_extract_code_func) { CUctx_st *context = GPGPUSim_Context(gpgpu_ctx); - extract_code_using_cuobjdump(); // extract all the output of cuobjdump to - // _cuobjdump_*.* + ctx_extract_code_func(); // extract all the output of cuobjdump to + // _cuobjdump_*.* const char *pre_load = getenv("CUOBJDUMP_SIM_FILE"); if (pre_load == NULL || strlen(pre_load) == 0) { cuobjdumpSectionList = pruneSectionList(context); @@ -3477,6 +3685,16 @@ void cuda_runtime_api::cuobjdumpInit() { } } +void cuda_runtime_api::cuobjdumpInit(const char *fn) { + auto ctx_extract_code_func = [=]() { extract_code_using_cuobjdump(fn); }; + cuobjdumpInit_internal(ctx_extract_code_func); +} + +void cuda_runtime_api::cuobjdumpInit() { + auto ctx_extract_code_func = [=]() { extract_code_using_cuobjdump(); }; + cuobjdumpInit_internal(ctx_extract_code_func); +} + //! 
Either submit PTX for simulation or convert SASS to PTXPlus and submit it void gpgpu_context::cuobjdumpParseBinary(unsigned int handle) { CUctx_st *context = GPGPUSim_Context(this); @@ -3587,6 +3805,10 @@ void gpgpu_context::cuobjdumpParseBinary(unsigned int handle) { extern "C" { +void **CUDARTAPI __cudaRegisterFatBinarySST(const char *fn) { + return cudaRegisterFatBinaryInternal(fn, NULL); +} + void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) { if (g_debug_execution >= 3) { announce_call(__my_func__); @@ -3619,6 +3841,14 @@ cudaError_t CUDARTAPI __cudaPopCallConfiguration(dim3 *gridDim, dim3 *blockDim, return g_last_cudaError = cudaSuccess; } +void CUDARTAPI __cudaRegisterFunctionSST(unsigned fatCubinHandle, + uint64_t hostFun, + char deviceFun[512]) { + cudaRegisterFunctionInternal((void **)fatCubinHandle, (const char *)hostFun, + (char *)deviceFun, NULL, NULL, NULL, NULL, NULL, + NULL); +} + void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, diff --git a/libcuda/gpgpu_context.h b/libcuda/gpgpu_context.h index d0cd7c48c..5ef21177b 100644 --- a/libcuda/gpgpu_context.h +++ b/libcuda/gpgpu_context.h @@ -44,6 +44,9 @@ class gpgpu_context { s_g_pc_to_insn; // a direct mapping from PC to instruction bool debug_tensorcore; + // SST related + bool requested_synchronize = false; + // objects pointers for each file cuda_runtime_api *api; ptxinfo_data *ptxinfo; @@ -54,6 +57,7 @@ class gpgpu_context { ptx_stats *stats; // member function list void synchronize(); + bool synchronize_check(); void exit_simulation(); void print_simulation_time(); int gpgpu_opencl_ptx_sim_main_perf(kernel_info_t *grid); diff --git a/setup_environment b/setup_environment index 342810151..2fac1b991 100644 --- a/setup_environment +++ b/setup_environment @@ -46,7 +46,6 @@ fi CC_VERSION=$(gcc --version | head -1 | awk '{for(i=1;i<=NF;i++){ if(match($i,/^[0-9]+\.[0-9]+\.[0-9]+$/)) {print $i; exit 0}}}') - 
CUDA_VERSION_STRING=`$CUDA_INSTALL_PATH/bin/nvcc --version | awk '/release/ {print $5;}' | sed 's/,//'`; export CUDA_VERSION_NUMBER=`echo $CUDA_VERSION_STRING | sed 's/\./ /' | awk '{printf("%02u%02u", 10*int($1), 10*$2);}'` if [ $CUDA_VERSION_NUMBER -gt 11100 -o $CUDA_VERSION_NUMBER -lt 2030 ]; then @@ -61,13 +60,18 @@ if [ $CUDA_VERSION_NUMBER -ge 6000 ]; then export CUOBJDUMP_SIM_FILE=jj fi +# Simple configure, loop through all positional arguments +# Default config +export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release -if [ $# = '1' ] ; -then - export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/$1 -else - export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/release -fi +for opt in $@ +do + if [[ $opt == 'debug' ]] ; then + # Debug mode + echo -n "enabled debug mode " + export GPGPUSIM_CONFIG=gcc-$CC_VERSION/cuda-$CUDA_VERSION_NUMBER/$1 + fi +done export QTINC=/usr/include diff --git a/src/cuda-sim/cuda-sim.cc b/src/cuda-sim/cuda-sim.cc index 833d33f5c..2fd90c0e5 100644 --- a/src/cuda-sim/cuda-sim.cc +++ b/src/cuda-sim/cuda-sim.cc @@ -1305,7 +1305,12 @@ void function_info::add_param_name_type_size(unsigned index, std::string name, void function_info::add_param_data(unsigned argn, struct gpgpu_ptx_sim_arg *args) { const void *data = args->m_start; - + if (g_debug_execution >= 3) { + if (args->m_nbytes == 4) + printf("ADD_PARAM_DATA %d\n", *((uint32_t *)data)); + else + printf("ADD_PARAM_DATA %p\n", *((void **)data)); + } bool scratchpad_memory_param = false; // Is this parameter in CUDA shared memory or OpenCL local memory @@ -1746,6 +1751,17 @@ static unsigned get_tex_datasize(const ptx_instruction *pI, ptx_thread_info *thread) { const operand_info &src1 = pI->src1(); // the name of the texture std::string texname = src1.name(); + // If indirect access, use register's value as address + // to find the symbol + if (src1.is_reg()) { + const operand_info &dst = pI->dst(); + ptx_reg_t src1_data = + 
thread->get_operand_value(src1, dst, pI->get_type(), thread, 1); + addr_t sym_addr = src1_data.u64; + symbol *texRef = thread->get_symbol_table()->lookup_by_addr(sym_addr); + assert(texRef != NULL); + texname = texRef->name(); + } /* For programs with many streams, textures can be bound and unbound @@ -2285,15 +2301,24 @@ void cuda_sim::gpgpu_ptx_sim_memcpy_symbol(const char *hostVar, const void *src, sym_name = g->second; mem_region = global_space; } - if (g_globals.find(hostVar) != g_globals.end()) { - found_sym = true; - sym_name = hostVar; - mem_region = global_space; - } - if (g_constants.find(hostVar) != g_constants.end()) { - found_sym = true; - sym_name = hostVar; - mem_region = const_space; + + // Weili: Only attempt to find symbol as it is a string + // if we could not find it in previously registered variable. + // This will avoid constructing std::string() from hostVar address + // where it is not a string as + // Use of a string naming a variable as the symbol parameter was deprecated in + // CUDA 4.1 and removed in CUDA 5.0. 
+ if (!found_sym) { + if (g_globals.find(hostVar) != g_globals.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = global_space; + } + if (g_constants.find(hostVar) != g_constants.end()) { + found_sym = true; + sym_name = hostVar; + mem_region = const_space; + } } if (!found_sym) { diff --git a/src/cuda-sim/instructions.cc b/src/cuda-sim/instructions.cc index 108de9759..843bf0ba7 100644 --- a/src/cuda-sim/instructions.cc +++ b/src/cuda-sim/instructions.cc @@ -6055,6 +6055,17 @@ void tex_impl(const ptx_instruction *pI, ptx_thread_info *thread) { // to be fetched std::string texname = src1.name(); + // If indirect access, use register's value as address + // to find the symbol + if (src1.is_reg()) { + ptx_reg_t src1_data = + thread->get_operand_value(src1, dst, pI->get_type(), thread, 1); + addr_t sym_addr = src1_data.u64; + symbol *texRef = thread->get_symbol_table()->lookup_by_addr(sym_addr); + assert(texRef != NULL); + texname = texRef->name(); + } + unsigned to_type = pI->get_type(); unsigned c_type = pI->get_type2(); fflush(stdout); diff --git a/src/cuda-sim/ptx_ir.cc b/src/cuda-sim/ptx_ir.cc index 139920930..4e500ccb4 100644 --- a/src/cuda-sim/ptx_ir.cc +++ b/src/cuda-sim/ptx_ir.cc @@ -139,6 +139,22 @@ symbol *symbol_table::lookup(const char *identifier) { return NULL; } +symbol *symbol_table::lookup_by_addr(addr_t addr) { + for (auto it = m_symbols.begin(); it != m_symbols.end(); ++it) { + symbol *sym = it->second; + + // check if symbol has the addr to be found + if ((!sym->is_reg()) && (sym->has_valid_address()) && + (sym->get_address() == addr)) { + return sym; + } + } + if (m_parent) { + return m_parent->lookup_by_addr(addr); + } + return NULL; +} + symbol *symbol_table::add_variable(const char *identifier, const type_info *type, unsigned size, const char *filename, unsigned line) { diff --git a/src/cuda-sim/ptx_ir.h b/src/cuda-sim/ptx_ir.h index d253866db..b08a692d8 100644 --- a/src/cuda-sim/ptx_ir.h +++ b/src/cuda-sim/ptx_ir.h @@ -205,6 
+205,7 @@ class symbol { const std::string &name() const { return m_name; } const std::string &decl_location() const { return m_decl_location; } const type_info *type() const { return m_type; } + bool has_valid_address() const { return m_address_valid; } addr_t get_address() const { assert(m_is_label || !m_type->get_key().is_reg()); // todo : other assertions @@ -310,6 +311,7 @@ class symbol_table { void set_ptx_version(float ver, unsigned ext); void set_sm_target(const char *target, const char *ext, const char *ext2); symbol *lookup(const char *identifier); + symbol *lookup_by_addr(addr_t addr); std::string get_scope_name() const { return m_scope_name; } symbol *add_variable(const char *identifier, const type_info *type, unsigned size, const char *filename, unsigned line); diff --git a/src/cuda-sim/ptx_sim.h b/src/cuda-sim/ptx_sim.h index f0c26efc8..8eec922e4 100644 --- a/src/cuda-sim/ptx_sim.h +++ b/src/cuda-sim/ptx_sim.h @@ -459,6 +459,9 @@ class ptx_thread_info { // Jin: get corresponding kernel grid for CDP purpose kernel_info_t &get_kernel() { return m_kernel; } + // Weili: access symbol_table + symbol_table *get_symbol_table() { return m_symbol_table; } + public: addr_t m_last_effective_address; bool m_branch_taken; diff --git a/src/gpgpu-sim/gpu-cache.cc b/src/gpgpu-sim/gpu-cache.cc index cd3c88033..0ea9ff63d 100644 --- a/src/gpgpu-sim/gpu-cache.cc +++ b/src/gpgpu-sim/gpu-cache.cc @@ -2062,6 +2062,7 @@ enum cache_request_status tex_cache::access(new_addr_type addr, mem_fetch *mf, void tex_cache::cycle() { // send next request to lower level of memory + // TODO: Use different full() for sst_mem_interface? 
if (!m_request_fifo.empty()) { mem_fetch *mf = m_request_fifo.peek(); if (!m_memport->full(mf->get_ctrl_size(), false)) { diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index 5bd41805d..b92494b43 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -319,6 +319,9 @@ void memory_config::reg_options(class OptionParser *opp) { "elimnate_rw_turnaround i.e set tWTR and tRTW = 0", "0"); option_parser_register(opp, "-icnt_flit_size", OPT_UINT32, &icnt_flit_size, "icnt_flit_size", "32"); + // SST mode activate + option_parser_register(opp, "-SST_mode", OPT_BOOL, &SST_mode, "SST mode", + "0"); m_address_mapping.addrdec_setoption(opp); } @@ -955,6 +958,16 @@ void exec_gpgpu_sim::createSIMTCluster() { m_shader_stats, m_memory_stats); } +// SST get its own simt_cluster +void sst_gpgpu_sim::createSIMTCluster() { + m_cluster = new simt_core_cluster *[m_shader_config->n_simt_clusters]; + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + m_cluster[i] = + new sst_simt_core_cluster(this, i, m_shader_config, m_memory_config, + m_shader_stats, m_memory_stats); + SST_gpgpu_reply_buffer.resize(m_shader_config->n_simt_clusters); +} + gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) : gpgpu_t(config, ctx), m_config(config) { gpgpu_ctx = ctx; @@ -999,26 +1012,29 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) gpu_kernel_time.clear(); - m_memory_partition_unit = - new memory_partition_unit *[m_memory_config->m_n_mem]; - m_memory_sub_partition = - new memory_sub_partition *[m_memory_config->m_n_mem_sub_partition]; - for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { - m_memory_partition_unit[i] = - new memory_partition_unit(i, m_memory_config, m_memory_stats, this); - for (unsigned p = 0; - p < m_memory_config->m_n_sub_partition_per_memory_channel; p++) { - unsigned submpid = - i * m_memory_config->m_n_sub_partition_per_memory_channel + p; - m_memory_sub_partition[submpid] = - 
m_memory_partition_unit[i]->get_sub_partition(p); + // TODO: somehow move this logic to the sst_gpgpu_sim constructor? + if (!m_config.is_SST_mode()) { + // Init memory if not in SST mode + m_memory_partition_unit = + new memory_partition_unit *[m_memory_config->m_n_mem]; + m_memory_sub_partition = + new memory_sub_partition *[m_memory_config->m_n_mem_sub_partition]; + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) { + m_memory_partition_unit[i] = + new memory_partition_unit(i, m_memory_config, m_memory_stats, this); + for (unsigned p = 0; + p < m_memory_config->m_n_sub_partition_per_memory_channel; p++) { + unsigned submpid = + i * m_memory_config->m_n_sub_partition_per_memory_channel + p; + m_memory_sub_partition[submpid] = + m_memory_partition_unit[i]->get_sub_partition(p); + } } - } - - icnt_wrapper_init(); - icnt_create(m_shader_config->n_simt_clusters, - m_memory_config->m_n_mem_sub_partition); + icnt_wrapper_init(); + icnt_create(m_shader_config->n_simt_clusters, + m_memory_config->m_n_mem_sub_partition); + } time_vector_create(NUM_MEM_REQ_STAT); fprintf(stdout, "GPGPU-Sim uArch: performance model initialization complete.\n"); @@ -1037,6 +1053,22 @@ gpgpu_sim::gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) m_functional_sim_kernel = NULL; } +void sst_gpgpu_sim::SST_receive_mem_reply(unsigned core_id, void *mem_req) { + assert(core_id < m_shader_config->n_simt_clusters); + mem_fetch *mf = (mem_fetch *)mem_req; + + (SST_gpgpu_reply_buffer[core_id]).push_back(mf); +} + +mem_fetch *sst_gpgpu_sim::SST_pop_mem_reply(unsigned core_id) { + if (SST_gpgpu_reply_buffer[core_id].size() > 0) { + mem_fetch *temp = SST_gpgpu_reply_buffer[core_id].front(); + SST_gpgpu_reply_buffer[core_id].pop_front(); + return temp; + } else + return NULL; +} + int gpgpu_sim::shared_mem_size() const { return m_shader_config->gpgpu_shmem_size; } @@ -1132,6 +1164,26 @@ bool gpgpu_sim::active() { return false; } +bool sst_gpgpu_sim::active() { + if 
(m_config.gpu_max_cycle_opt && + (gpu_tot_sim_cycle + gpu_sim_cycle) >= m_config.gpu_max_cycle_opt) + return false; + if (m_config.gpu_max_insn_opt && + (gpu_tot_sim_insn + gpu_sim_insn) >= m_config.gpu_max_insn_opt) + return false; + if (m_config.gpu_max_cta_opt && + (gpu_tot_issued_cta >= m_config.gpu_max_cta_opt)) + return false; + if (m_config.gpu_max_completed_cta_opt && + (gpu_completed_cta >= m_config.gpu_max_completed_cta_opt)) + return false; + if (m_config.gpu_deadlock_detect && gpu_deadlock) return false; + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + if (m_cluster[i]->get_not_completed() > 0) return true; + if (get_more_cta_left()) return true; + return false; +} + void gpgpu_sim::init() { // run a CUDA grid on the GPU microarchitecture simulator gpu_sim_cycle = 0; @@ -2157,6 +2209,11 @@ void gpgpu_sim::cycle() { } } +void sst_gpgpu_sim::cycle() { + SST_cycle(); + return; +} + void shader_core_ctx::dump_warp_state(FILE *fout) const { fprintf(fout, "\n"); fprintf(fout, "per warp functional simulation status:\n"); @@ -2236,3 +2293,110 @@ const shader_core_config *gpgpu_sim::getShaderCoreConfig() { const memory_config *gpgpu_sim::getMemoryConfig() { return m_memory_config; } simt_core_cluster *gpgpu_sim::getSIMTCluster() { return *m_cluster; } + +void sst_gpgpu_sim::SST_gpgpusim_numcores_equal_check(unsigned sst_numcores) { + if (m_shader_config->n_simt_clusters != sst_numcores) { + assert( + "\nSST core is not equal the GPGPU-sim cores. 
Open gpgpu-sim.config " + "file and ensure n_simt_clusters" + "is the same as SST gpu cores.\n" && + 0); + } else { + printf("\nSST GPU core is equal the GPGPU-sim cores = %d\n", sst_numcores); + } +} + +void sst_gpgpu_sim::SST_cycle() { + // shader core loading (pop from ICNT into core) follows CORE clock + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) + static_cast(m_cluster[i])->icnt_cycle_SST(); + + // L1 cache + shader core pipeline stages + m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX].clear(); + for (unsigned i = 0; i < m_shader_config->n_simt_clusters; i++) { + if (m_cluster[i]->get_not_completed() || get_more_cta_left()) { + m_cluster[i]->core_cycle(); + *active_sms += m_cluster[i]->get_n_active_sms(); + } + // Update core icnt/cache stats for GPUWattch + m_cluster[i]->get_icnt_stats( + m_power_stats->pwr_mem_stat->n_simt_to_mem[CURRENT_STAT_IDX][i], + m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); + m_cluster[i]->get_cache_stats( + m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]); + } + float temp = 0; + for (unsigned i = 0; i < m_shader_config->num_shader(); i++) { + temp += m_shader_stats->m_pipeline_duty_cycle[i]; + } + temp = temp / m_shader_config->num_shader(); + *average_pipeline_duty_cycle = ((*average_pipeline_duty_cycle) + temp); + // cout<<"Average pipeline duty cycle: "<<*average_pipeline_duty_cycle<= g_single_step)) { + asm("int $03"); + } + gpu_sim_cycle++; + if (g_interactive_debugger_enabled) gpgpu_debug(); + + // McPAT main cycle (interface with McPAT) +#ifdef GPGPUSIM_POWER_MODEL + if (m_config.g_power_simulation_enabled) { + mcpat_cycle(m_config, getShaderCoreConfig(), m_gpgpusim_wrapper, + m_power_stats, m_config.gpu_stat_sample_freq, gpu_tot_sim_cycle, + gpu_sim_cycle, gpu_tot_sim_insn, gpu_sim_insn, + m_config.g_dvfs_enabled); + } +#endif + + issue_block2core(); + + if (!(gpu_sim_cycle % m_config.gpu_stat_sample_freq)) { + time_t days, hrs, minutes, sec; + time_t 
curr_time; + time(&curr_time); + unsigned long long elapsed_time = + MAX(curr_time - gpgpu_ctx->the_gpgpusim->g_simulation_starttime, 1); + if ((elapsed_time - last_liveness_message_time) >= + m_config.liveness_message_freq) { + days = elapsed_time / (3600 * 24); + hrs = elapsed_time / 3600 - 24 * days; + minutes = elapsed_time / 60 - 60 * (hrs + 24 * days); + sec = elapsed_time - 60 * (minutes + 60 * (hrs + 24 * days)); + + last_liveness_message_time = elapsed_time; + } + visualizer_printstat(); + m_memory_stats->memlatstat_lat_pw(); + if (m_config.gpgpu_runtime_stat && (m_config.gpu_runtime_stat_flag != 0)) { + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_BW_STAT) { + for (unsigned i = 0; i < m_memory_config->m_n_mem; i++) + m_memory_partition_unit[i]->print_stat(stdout); + printf("maxmrqlatency = %d \n", m_memory_stats->max_mrq_latency); + printf("maxmflatency = %d \n", m_memory_stats->max_mf_latency); + } + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_SHD_INFO) + shader_print_runtime_stat(stdout); + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_L1MISS) + shader_print_l1_miss_stat(stdout); + if (m_config.gpu_runtime_stat_flag & GPU_RSTAT_SCHED) + shader_print_scheduler_stat(stdout, false); + } + } + + if (!(gpu_sim_cycle % 20000)) { + // deadlock detection + if (m_config.gpu_deadlock_detect && gpu_sim_insn == last_gpu_sim_insn) { + gpu_deadlock = true; + } else { + last_gpu_sim_insn = gpu_sim_insn; + } + } + try_snap_shot(gpu_sim_cycle); + spill_log_to_file(stdout, 0, gpu_sim_cycle); + +#if (CUDART_VERSION >= 5000) + // launch device kernel + gpgpu_ctx->device_runtime->launch_one_device_kernel(); +#endif +} diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index 8e81451b6..d0c2a1763 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -69,6 +69,38 @@ class gpgpu_context; extern tr1_hash_map address_random_interleaving; +// SST communication functions +/** + * @brief Check if SST requests buffer is full + * + * @param core_id + 
* @return true + * @return false + */ +extern bool is_SST_buffer_full(unsigned core_id); + +/** + * @brief Send loads to SST memory backend + * + * @param core_id + * @param address + * @param size + * @param mem_req + */ +extern void send_read_request_SST(unsigned core_id, uint64_t address, + size_t size, void *mem_req); + +/** + * @brief Send stores to SST memory backend + * + * @param core_id + * @param address + * @param size + * @param mem_req + */ +extern void send_write_request_SST(unsigned core_id, uint64_t address, + size_t size, void *mem_req); + enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 }; enum hw_perf_t { @@ -274,6 +306,14 @@ class memory_config { } void reg_options(class OptionParser *opp); + /** + * @brief Check if the config script is in SST mode + * + * @return true + * @return false + */ + bool is_SST_mode() const { return SST_mode; } + bool m_valid; mutable l2_cache_config m_L2_config; bool m_L2_texure_only; @@ -351,7 +391,7 @@ class memory_config { unsigned write_low_watermark; bool m_perf_sim_memcpy; bool simple_dram_model; - + bool SST_mode; gpgpu_context *gpgpu_ctx; }; @@ -398,6 +438,15 @@ class gpgpu_sim_config : public power_config, unsigned num_shader() const { return m_shader_config.num_shader(); } unsigned num_cluster() const { return m_shader_config.n_simt_clusters; } unsigned get_max_concurrent_kernel() const { return max_concurrent_kernel; } + + /** + * @brief Check if we are in SST mode + * + * @return true + * @return false + */ + bool is_SST_mode() const { return m_memory_config.SST_mode; } + unsigned checkpoint_option; size_t stack_limit() const { return stack_size_limit; } @@ -462,6 +511,7 @@ class gpgpu_sim_config : public power_config, unsigned long long liveness_message_freq; friend class gpgpu_sim; + friend class sst_gpgpu_sim; }; struct occupancy_stats { @@ -600,10 +650,18 @@ class gpgpu_sim : public gpgpu_t { void hit_watchpoint(unsigned watchpoint_num, ptx_thread_info *thd, const ptx_instruction *pI); + /** + * 
@brief Check if we are in SST mode + * + * @return true + * @return false + */ + bool is_SST_mode() { return m_config.is_SST_mode(); } + // backward pointer class gpgpu_context *gpgpu_ctx; - private: + protected: // clocks void reinit_clock_domains(void); int next_clock_domain(void); @@ -715,7 +773,7 @@ class gpgpu_sim : public gpgpu_t { void set_cache_config(std::string kernel_name); // Jin: functional simulation for CDP - private: + protected: // set by stream operation every time a functoinal simulation is done bool m_functional_sim; kernel_info_t *m_functional_sim_kernel; @@ -748,4 +806,79 @@ class exec_gpgpu_sim : public gpgpu_sim { virtual void createSIMTCluster(); }; +/** + * @brief A GPGPUSim class customized to SST Balar interfacing + * + */ +class sst_gpgpu_sim : public gpgpu_sim { + public: + sst_gpgpu_sim(const gpgpu_sim_config &config, gpgpu_context *ctx) + : gpgpu_sim(config, ctx) { + createSIMTCluster(); + } + + // SST memory handling + std::vector> + SST_gpgpu_reply_buffer; /** SST mem response queue */ + + /** + * @brief Receive mem request's response from SST and put + * it in a buffer (SST_gpgpu_reply_buffer) + * + * @param core_id + * @param mem_req + */ + void SST_receive_mem_reply(unsigned core_id, void *mem_req); + + /** + * @brief Pop the head of the buffer queue to get the + * memory response + * + * @param core_id + * @return mem_fetch* + */ + mem_fetch *SST_pop_mem_reply(unsigned core_id); + + virtual void createSIMTCluster(); + + // SST Balar interfacing + /** + * @brief Advance core and collect stats + * + */ + void SST_cycle(); + + /** + * @brief Wrapper of SST_cycle() + * + */ + void cycle(); + + /** + * @brief Whether the GPU is active, removed test for + * memory system since that is handled in SST + * + * @return true + * @return false + */ + bool active(); + + /** + * @brief SST mode use SST memory system instead, so the memcpy + * is empty here + * + * @param dst_start_addr + * @param count + */ + void perf_memcpy_to_gpu(size_t 
dst_start_addr, size_t count){}; + + /** + * @brief Check if the SST config matches up with the + * gpgpusim.config in core number + * + * @param sst_numcores SST core count + */ + void SST_gpgpusim_numcores_equal_check(unsigned sst_numcores); +}; + #endif diff --git a/src/gpgpu-sim/mem_fetch.cc b/src/gpgpu-sim/mem_fetch.cc index 7211a7dd3..809c92081 100644 --- a/src/gpgpu-sim/mem_fetch.cc +++ b/src/gpgpu-sim/mem_fetch.cc @@ -54,9 +54,15 @@ mem_fetch::mem_fetch(const mem_access_t &access, const warp_inst_t *inst, m_sid = sid; m_tpc = tpc; m_wid = wid; - config->m_address_mapping.addrdec_tlx(access.get_addr(), &m_raw_addr); - m_partition_addr = - config->m_address_mapping.partition_address(access.get_addr()); + + if (!config->is_SST_mode()) { + // In SST memory model, the SST memory hierarchy is + // responsible to generate the correct address mapping + config->m_address_mapping.addrdec_tlx(access.get_addr(), &m_raw_addr); + m_partition_addr = + config->m_address_mapping.partition_address(access.get_addr()); + } + m_type = m_access.is_write() ? 
WRITE_REQUEST : READ_REQUEST; m_timestamp = cycle; m_timestamp2 = 0; diff --git a/src/gpgpu-sim/mem_latency_stat.cc b/src/gpgpu-sim/mem_latency_stat.cc index 63d7ee80c..c77a68648 100644 --- a/src/gpgpu-sim/mem_latency_stat.cc +++ b/src/gpgpu-sim/mem_latency_stat.cc @@ -203,7 +203,15 @@ unsigned memory_stats_t::memlatstat_done(mem_fetch *mf) { } void memory_stats_t::memlatstat_read_done(mem_fetch *mf) { - if (m_memory_config->gpgpu_memlatency_stat) { + if (m_memory_config->SST_mode) { + // in SST mode, we just calculate mem latency + unsigned mf_latency; + mf_latency = + (m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle) - mf->get_timestamp(); + num_mfs++; + mf_total_lat += mf_latency; + if (mf_latency > max_mf_latency) max_mf_latency = mf_latency; + } else if (m_memory_config->gpgpu_memlatency_stat) { unsigned mf_latency = memlatstat_done(mf); if (mf_latency > mf_max_lat_table[mf->get_tlx_addr().chip][mf->get_tlx_addr().bk]) @@ -273,7 +281,12 @@ void memory_stats_t::memlatstat_print(unsigned n_mem, unsigned gpu_mem_n_bk) { unsigned max_bank_accesses, min_bank_accesses, max_chip_accesses, min_chip_accesses; - if (m_memory_config->gpgpu_memlatency_stat) { + if (m_memory_config->SST_mode) { + // in SST mode, we just calculate mem latency + printf("max_mem_SST_latency = %d \n", max_mf_latency); + if (num_mfs) + printf("average_mf_SST_latency = %lld \n", mf_total_lat / num_mfs); + } else if (m_memory_config->gpgpu_memlatency_stat) { printf("maxmflatency = %d \n", max_mf_latency); printf("max_icnt2mem_latency = %d \n", max_icnt2mem_latency); printf("maxmrqlatency = %d \n", max_mrq_latency); diff --git a/src/gpgpu-sim/shader.cc b/src/gpgpu-sim/shader.cc index 4d4f11277..7482e0ef9 100644 --- a/src/gpgpu-sim/shader.cc +++ b/src/gpgpu-sim/shader.cc @@ -162,7 +162,10 @@ void shader_core_ctx::create_front_pipeline() { } // m_icnt = new shader_memory_interface(this,cluster); - if (m_config->gpgpu_perfect_mem) { + if (m_memory_config->SST_mode) { + m_icnt = new 
sst_memory_interface( + this, static_cast(m_cluster)); + } else if (m_config->gpgpu_perfect_mem) { m_icnt = new perfect_memory_interface(this, m_cluster); } else { m_icnt = new shader_memory_interface(this, m_cluster); @@ -2281,7 +2284,15 @@ bool ldst_unit::memory_cycle(warp_inst_t &inst, inst.is_store() ? WRITE_PACKET_SIZE : READ_PACKET_SIZE; unsigned size = access.get_size() + control_size; // printf("Interconnect:Addr: %x, size=%d\n",access.get_addr(),size); - if (m_icnt->full(size, inst.is_store() || inst.isatomic())) { + if (m_memory_config->SST_mode && + (static_cast(m_icnt)->full( + size, inst.is_store() || inst.isatomic(), access.get_type()))) { + // SST need mf type here + // Cast it to sst_memory_interface pointer first as this full() method + // is not a virtual method in parent class + stall_cond = ICNT_RC_FAIL; + } else if (!m_memory_config->SST_mode && + (m_icnt->full(size, inst.is_store() || inst.isatomic()))) { stall_cond = ICNT_RC_FAIL; } else { mem_fetch *mf = @@ -2846,7 +2857,10 @@ void ldst_unit::cycle() { } } else { if (mf->get_type() == WRITE_ACK || - (m_config->gpgpu_perfect_mem && mf->get_is_write())) { + ((m_config->gpgpu_perfect_mem || m_memory_config->SST_mode) && + mf->get_is_write())) { + // SST memory is handled by SST mem hierarchy + // Perfect mem m_core->store_ack(mf); m_response_fifo.pop_front(); delete mf; @@ -4020,7 +4034,8 @@ void shader_core_ctx::accept_ldst_unit_response(mem_fetch *mf) { void shader_core_ctx::store_ack(class mem_fetch *mf) { assert(mf->get_type() == WRITE_ACK || - (m_config->gpgpu_perfect_mem && mf->get_is_write())); + ((m_config->gpgpu_perfect_mem || m_memory_config->SST_mode) && + mf->get_is_write())); unsigned warp_id = mf->get_wid(); m_warp[warp_id]->dec_store_req(); } @@ -4573,7 +4588,46 @@ bool simt_core_cluster::icnt_injection_buffer_full(unsigned size, bool write) { return !::icnt_has_buffer(m_cluster_id, request_size); } +bool sst_simt_core_cluster::SST_injection_buffer_full(unsigned size, bool write, 
+ mem_access_type type) { + switch (type) { + case CONST_ACC_R: + case INST_ACC_R: { + return response_queue_full(); + break; + } + default: { + return ::is_SST_buffer_full(m_cluster_id); + break; + } + } +} + void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { + // Update stats based on mf type + update_icnt_stats(mf); + + // The packet size varies depending on the type of request: + // - For write request and atomic request, the packet contains the data + // - For read request (i.e. not write nor atomic), the packet only has control + // metadata + unsigned int packet_size = mf->size(); + if (!mf->get_is_write() && !mf->isatomic()) { + packet_size = mf->get_ctrl_size(); + } + m_stats->m_outgoing_traffic_stats->record_traffic(mf, packet_size); + unsigned destination = mf->get_sub_partition_id(); + mf->set_status(IN_ICNT_TO_MEM, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + if (!mf->get_is_write() && !mf->isatomic()) + ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, + mf->get_ctrl_size()); + else + ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, + mf->size()); +} + +void simt_core_cluster::update_icnt_stats(class mem_fetch *mf) { // stats if (mf->get_is_write()) m_stats->made_write_mfs++; @@ -4618,6 +4672,12 @@ void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { default: assert(0); } +} + +void sst_simt_core_cluster::icnt_inject_request_packet_to_SST( + class mem_fetch *mf) { + // Update stats + update_icnt_stats(mf); // The packet size varies depending on the type of request: // - For write request and atomic request, the packet contains the data @@ -4628,15 +4688,25 @@ void simt_core_cluster::icnt_inject_request_packet(class mem_fetch *mf) { packet_size = mf->get_ctrl_size(); } m_stats->m_outgoing_traffic_stats->record_traffic(mf, packet_size); - unsigned destination = mf->get_sub_partition_id(); mf->set_status(IN_ICNT_TO_MEM, m_gpu->gpu_sim_cycle + 
m_gpu->gpu_tot_sim_cycle); - if (!mf->get_is_write() && !mf->isatomic()) - ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, - mf->get_ctrl_size()); - else - ::icnt_push(m_cluster_id, m_config->mem2device(destination), (void *)mf, - mf->size()); + switch (mf->get_access_type()) { + case CONST_ACC_R: + case INST_ACC_R: { + push_response_fifo(mf); + break; + } + default: { + if (!mf->get_is_write() && !mf->isatomic()) + ::send_read_request_SST(m_cluster_id, mf->get_addr(), + mf->get_data_size(), (void *)mf); + else + ::send_write_request_SST(m_cluster_id, mf->get_addr(), + mf->get_data_size(), (void *)mf); + + break; + } + } } void simt_core_cluster::icnt_cycle() { @@ -4678,6 +4748,49 @@ void simt_core_cluster::icnt_cycle() { } } +void sst_simt_core_cluster::icnt_cycle_SST() { + if (!m_response_fifo.empty()) { + mem_fetch *mf = m_response_fifo.front(); + unsigned cid = m_config->sid_to_cid(mf->get_sid()); + if (mf->get_access_type() == INST_ACC_R) { + // instruction fetch response + if (!m_core[cid]->fetch_unit_response_buffer_full()) { + m_response_fifo.pop_front(); + m_core[cid]->accept_fetch_response(mf); + } + } else { + // data response + if (!m_core[cid]->ldst_unit_response_buffer_full()) { + m_response_fifo.pop_front(); + m_memory_stats->memlatstat_read_done(mf); + m_core[cid]->accept_ldst_unit_response(mf); + } + } + } + + // pop from SST buffers + if (m_response_fifo.size() < m_config->n_simt_ejection_buffer_size) { + mem_fetch *mf = (mem_fetch *)(static_cast(get_gpu()) + ->SST_pop_mem_reply(m_cluster_id)); + if (!mf) return; + assert(mf->get_tpc() == m_cluster_id); + + // do atomic here + // For now, we execute atomic when the mem reply comes back + // This needs to be validated + if (mf && mf->isatomic()) mf->do_atomic(); + + unsigned int packet_size = + (mf->get_is_write()) ? 
mf->get_ctrl_size() : mf->size(); + m_stats->m_incoming_traffic_stats->record_traffic(mf, packet_size); + mf->set_status(IN_CLUSTER_TO_SHADER_QUEUE, + m_gpu->gpu_sim_cycle + m_gpu->gpu_tot_sim_cycle); + // m_memory_stats->memlatstat_read_done(mf,m_shader_config->max_warps_per_shader); + m_response_fifo.push_back(mf); + m_stats->n_mem_to_simt[m_cluster_id] += mf->get_num_flits(false); + } +} + void simt_core_cluster::get_pdom_stack_top_info(unsigned sid, unsigned tid, unsigned *pc, unsigned *rpc) const { diff --git a/src/gpgpu-sim/shader.h b/src/gpgpu-sim/shader.h index e658a14c9..ee10af664 100644 --- a/src/gpgpu-sim/shader.h +++ b/src/gpgpu-sim/shader.h @@ -2015,6 +2015,7 @@ class shader_core_stats : public shader_core_stats_pod { friend class shader_core_ctx; friend class ldst_unit; friend class simt_core_cluster; + friend class sst_simt_core_cluster; friend class scheduler_unit; friend class TwoLevelScheduler; friend class LooseRoundRobbinScheduler; @@ -2624,6 +2625,7 @@ class simt_core_cluster { void cache_invalidate(); bool icnt_injection_buffer_full(unsigned size, bool write); void icnt_inject_request_packet(class mem_fetch *mf); + void update_icnt_stats(class mem_fetch *mf); // for perfect memory interface bool response_queue_full() { @@ -2685,6 +2687,50 @@ class exec_simt_core_cluster : public simt_core_cluster { virtual void create_shader_core_ctx(); }; +/** + * @brief SST cluster class + * + */ +class sst_simt_core_cluster : public exec_simt_core_cluster { + public: + sst_simt_core_cluster(class gpgpu_sim *gpu, unsigned cluster_id, + const shader_core_config *config, + const memory_config *mem_config, + class shader_core_stats *stats, + class memory_stats_t *mstats) + : exec_simt_core_cluster(gpu, cluster_id, config, mem_config, stats, + mstats) {} + + /** + * @brief Check if SST memory request injection + * buffer is full by using extern + * function is_SST_buffer_full() + * defined in Balar + * + * @param size + * @param write + * @param type + * @return 
true + * @return false + */ + bool SST_injection_buffer_full(unsigned size, bool write, + mem_access_type type); + + /** + * @brief Send memory request packets to SST + * memory + * + * @param mf + */ + void icnt_inject_request_packet_to_SST(class mem_fetch *mf); + + /** + * @brief Advance ICNT between core and SST + * + */ + void icnt_cycle_SST(); +}; + class shader_memory_interface : public mem_fetch_interface { public: shader_memory_interface(shader_core_ctx *core, simt_core_cluster *cluster) { @@ -2725,6 +2771,61 @@ class perfect_memory_interface : public mem_fetch_interface { simt_core_cluster *m_cluster; }; +/** + * @brief SST memory interface + * + */ +class sst_memory_interface : public mem_fetch_interface { + public: + sst_memory_interface(shader_core_ctx *core, sst_simt_core_cluster *cluster) { + m_core = core; + m_cluster = cluster; + } + /** + * @brief For constant, inst, tex cache access + * + * @param size + * @param write + * @return true + * @return false + */ + virtual bool full(unsigned size, bool write) const { + assert(false && "Use the full() method with access type instead!"); + return true; + } + + /** + * @brief With SST, the core will direct all mem access except for + * constant, tex, and inst reads to SST mem system + * (i.e. 
not modeling constant mem right now), thus + * requiring the mem_access_type information to be passed in + * + * @param size + * @param write + * @param type + * @return true + * @return false + */ + bool full(unsigned size, bool write, mem_access_type type) const { + return m_cluster->SST_injection_buffer_full(size, write, type); + } + + /** + * @brief Push memory request to SST memory system and + * update stats + * + * @param mf + */ + virtual void push(mem_fetch *mf) { + m_core->inc_simt_to_mem(mf->get_num_flits(true)); + m_cluster->icnt_inject_request_packet_to_SST(mf); + } + + private: + shader_core_ctx *m_core; + sst_simt_core_cluster *m_cluster; +}; + inline int scheduler_unit::get_sid() const { return m_shader->get_sid(); } #endif /* SHADER_H */ diff --git a/src/gpgpusim_entrypoint.cc b/src/gpgpusim_entrypoint.cc index 42c6981b0..839fef619 100644 --- a/src/gpgpusim_entrypoint.cc +++ b/src/gpgpusim_entrypoint.cc @@ -43,6 +43,20 @@ static int sg_argc = 3; static const char *sg_argv[] = {"", "-config", "gpgpusim.config"}; +// Help funcs to avoid multiple '->' for SST +GPGPUsim_ctx *GPGPUsim_ctx_ptr() { return GPGPU_Context()->the_gpgpusim; } + +class sst_gpgpu_sim *g_the_gpu() { + return static_cast(GPGPUsim_ctx_ptr()->g_the_gpu); +} + +class stream_manager *g_stream_manager() { + return GPGPUsim_ctx_ptr()->g_stream_manager; +} + +// SST callback +extern void SST_callback_cudaThreadSynchronize_done(); + void *gpgpu_sim_thread_sequential(void *ctx_ptr) { gpgpu_context *ctx = (gpgpu_context *)ctx_ptr; // at most one kernel running at a time @@ -169,6 +183,75 @@ void *gpgpu_sim_thread_concurrent(void *ctx_ptr) { return NULL; } +bool sst_sim_cycles = false; + +bool SST_Cycle() { + // Check if Synchronize is done when SST previously requested + // cudaThreadSynchronize + if (GPGPU_Context()->requested_synchronize && + ((g_stream_manager()->empty() && !GPGPUsim_ctx_ptr()->g_sim_active) || + GPGPUsim_ctx_ptr()->g_sim_done)) { + 
SST_callback_cudaThreadSynchronize_done(); + GPGPU_Context()->requested_synchronize = false; + } + + if (g_stream_manager()->empty_protected() && + !GPGPUsim_ctx_ptr()->g_sim_done && !g_the_gpu()->active()) { + GPGPUsim_ctx_ptr()->g_sim_active = false; + // printf("stream is empty %d \n", g_stream_manager->empty()); + return false; + } + + if (g_stream_manager()->operation(&sst_sim_cycles) && + !g_the_gpu()->active()) { + if (sst_sim_cycles) { + sst_sim_cycles = false; + } + return false; + } + + // printf("GPGPU-Sim: Give GPU Cycle\n"); + GPGPUsim_ctx_ptr()->g_sim_active = true; + + // functional simulation + if (g_the_gpu()->is_functional_sim()) { + kernel_info_t *kernel = g_the_gpu()->get_functional_kernel(); + assert(kernel); + GPGPUsim_ctx_ptr()->gpgpu_ctx->func_sim->gpgpu_cuda_ptx_sim_main_func( + *kernel); + g_the_gpu()->finish_functional_sim(kernel); + } + + // performance simulation + if (g_the_gpu()->active()) { + g_the_gpu()->SST_cycle(); + sst_sim_cycles = true; + g_the_gpu()->deadlock_check(); + } else { + if (g_the_gpu()->cycle_insn_cta_max_hit()) { + g_stream_manager()->stop_all_running_kernels(); + GPGPUsim_ctx_ptr()->g_sim_done = true; + GPGPUsim_ctx_ptr()->g_sim_active = false; + GPGPUsim_ctx_ptr()->break_limit = true; + } + } + + if (!g_the_gpu()->active()) { + g_the_gpu()->print_stats(GPGPUsim_ctx_ptr()->g_the_gpu->last_streamID); + g_the_gpu()->update_stats(); + GPGPU_Context()->print_simulation_time(); + } + + if (GPGPUsim_ctx_ptr()->break_limit) { + printf( + "GPGPU-Sim: ** break due to reaching the maximum cycles (or " + "instructions) **\n"); + return true; + } + + return false; +} + void gpgpu_context::synchronize() { printf("GPGPU-Sim: synchronize waiting for inactive GPU simulation\n"); the_gpgpusim->g_stream_manager->print(stdout); @@ -187,6 +270,27 @@ void gpgpu_context::synchronize() { // sem_post(&g_sim_signal_start); } +bool gpgpu_context::synchronize_check() { + // printf("GPGPU-Sim: synchronize checking for inactive GPU 
simulation\n"); + requested_synchronize = true; + the_gpgpusim->g_stream_manager->print(stdout); + fflush(stdout); + // sem_wait(&g_sim_signal_finish); + bool done = false; + pthread_mutex_lock(&(the_gpgpusim->g_sim_lock)); + done = (the_gpgpusim->g_stream_manager->empty() && + !the_gpgpusim->g_sim_active) || + the_gpgpusim->g_sim_done; + pthread_mutex_unlock(&(the_gpgpusim->g_sim_lock)); + if (done) { + printf( + "GPGPU-Sim: synchronize checking: detected inactive GPU simulation " + "thread\n"); + } + fflush(stdout); + return done; +} + void gpgpu_context::exit_simulation() { the_gpgpusim->g_sim_done = true; printf("GPGPU-Sim: exit_simulation called\n"); @@ -220,8 +324,14 @@ gpgpu_sim *gpgpu_context::gpgpu_ptx_sim_init_perf() { assert(setlocale(LC_NUMERIC, "C")); the_gpgpusim->g_the_gpu_config->init(); - the_gpgpusim->g_the_gpu = - new exec_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + if (the_gpgpusim->g_the_gpu_config->is_SST_mode()) { + // Create SST specific GPGPUSim + the_gpgpusim->g_the_gpu = + new sst_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + } else { + the_gpgpusim->g_the_gpu = + new exec_gpgpu_sim(*(the_gpgpusim->g_the_gpu_config), this); + } the_gpgpusim->g_stream_manager = new stream_manager( (the_gpgpusim->g_the_gpu), func_sim->g_cuda_launch_blocking); @@ -237,12 +347,17 @@ gpgpu_sim *gpgpu_context::gpgpu_ptx_sim_init_perf() { void gpgpu_context::start_sim_thread(int api) { if (the_gpgpusim->g_sim_done) { the_gpgpusim->g_sim_done = false; - if (api == 1) { - pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, - gpgpu_sim_thread_concurrent, (void *)this); + if (the_gpgpusim->g_the_gpu_config->is_SST_mode()) { + // Do not create concurrent thread in SST mode + g_the_gpu()->init(); } else { - pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, - gpgpu_sim_thread_sequential, (void *)this); + if (api == 1) { + pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, + gpgpu_sim_thread_concurrent, (void *)this); + } else { 
+ pthread_create(&(the_gpgpusim->g_simulation_thread), NULL, + gpgpu_sim_thread_sequential, (void *)this); + } } } } @@ -266,8 +381,13 @@ void gpgpu_context::print_simulation_time() { const unsigned cycles_per_sec = (unsigned)(the_gpgpusim->g_the_gpu->gpu_tot_sim_cycle / difference); printf("gpgpu_simulation_rate = %u (cycle/sec)\n", cycles_per_sec); - printf("gpgpu_silicon_slowdown = %ux\n", - the_gpgpusim->g_the_gpu->shader_clock() * 1000 / cycles_per_sec); + + if (cycles_per_sec == 0) { + printf("gpgpu_silicon_slowdown = Nan\n"); + } else { + printf("gpgpu_silicon_slowdown = %ux\n", + the_gpgpusim->g_the_gpu->shader_clock() * 1000 / cycles_per_sec); + } fflush(stdout); } diff --git a/src/stream_manager.cc b/src/stream_manager.cc index 72f8bb0b2..b974791d0 100644 --- a/src/stream_manager.cc +++ b/src/stream_manager.cc @@ -34,6 +34,12 @@ unsigned CUstream_st::sm_next_stream_uid = 0; +// SST memcpy callbacks +extern void SST_callback_memcpy_H2D_done(); +extern void SST_callback_memcpy_D2H_done(); +extern void SST_callback_memcpy_to_symbol_done(); +extern void SST_callback_memcpy_from_symbol_done(); + CUstream_st::CUstream_st() { m_pending = false; m_uid = sm_next_stream_uid++; @@ -122,11 +128,13 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { if (g_debug_execution >= 3) printf("memcpy host-to-device\n"); gpu->memcpy_to_gpu(m_device_address_dst, m_host_address_src, m_cnt); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_H2D_done(); break; case stream_memcpy_device_to_host: if (g_debug_execution >= 3) printf("memcpy device-to-host\n"); gpu->memcpy_from_gpu(m_host_address_dst, m_device_address_src, m_cnt); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_D2H_done(); break; case stream_memcpy_device_to_device: if (g_debug_execution >= 3) printf("memcpy device-to-device\n"); @@ -138,12 +146,14 @@ bool stream_operation::do_operation(gpgpu_sim *gpu) { gpu->gpgpu_ctx->func_sim->gpgpu_ptx_sim_memcpy_symbol( 
m_symbol, m_host_address_src, m_cnt, m_offset, 1, gpu); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_to_symbol_done(); break; case stream_memcpy_from_symbol: if (g_debug_execution >= 3) printf("memcpy from symbol\n"); gpu->gpgpu_ctx->func_sim->gpgpu_ptx_sim_memcpy_symbol( m_symbol, m_host_address_dst, m_cnt, m_offset, 0, gpu); m_stream->record_next_done(); + if (gpu->is_SST_mode()) SST_callback_memcpy_from_symbol_done(); break; case stream_kernel_launch: if (m_sim_mode) { // Functional Sim @@ -472,7 +482,7 @@ void stream_manager::push(stream_operation op) { } if (g_debug_execution >= 3) print_impl(stdout); pthread_mutex_unlock(&m_lock); - if (m_cuda_launch_blocking || stream == NULL) { + if (!m_gpu->is_SST_mode() && (m_cuda_launch_blocking || stream == NULL)) { unsigned int wait_amount = 100; unsigned int wait_cap = 100000; // 100ms while (!empty()) { From 3844f7559e69f206a80ae9b970b19e7e609e949e Mon Sep 17 00:00:00 2001 From: WilliamMTK Date: Tue, 14 Jan 2025 12:33:11 -0500 Subject: [PATCH 152/154] fix_sst_callbacks: add weak definitions for sst callbacks (#81) * fix_sst_callbacks: add weak definitions for sst callbacks * Automated Format --------- Co-authored-by: purdue-jenkins --- src/gpgpu-sim/gpu-sim.h | 10 +++++++++- src/gpgpusim_entrypoint.cc | 1 + src/stream_manager.cc | 4 ++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/gpgpu-sim/gpu-sim.h b/src/gpgpu-sim/gpu-sim.h index d0c2a1763..5b253ab10 100644 --- a/src/gpgpu-sim/gpu-sim.h +++ b/src/gpgpu-sim/gpu-sim.h @@ -78,6 +78,9 @@ extern tr1_hash_map address_random_interleaving; * @return false */ extern bool is_SST_buffer_full(unsigned core_id); +__attribute__((weak)) bool is_SST_buffer_full(unsigned core_id) { + return false; +} /** * @brief Send loads to SST memory backend @@ -89,7 +92,9 @@ extern bool is_SST_buffer_full(unsigned core_id); */ extern void send_read_request_SST(unsigned core_id, uint64_t address, size_t size, void *mem_req); - 
+__attribute__((weak)) void send_read_request_SST(unsigned core_id, + uint64_t address, size_t size, + void *mem_req) {} /** * @brief Send stores to SST memory backend * @@ -100,6 +105,9 @@ extern void send_read_request_SST(unsigned core_id, uint64_t address, */ extern void send_write_request_SST(unsigned core_id, uint64_t address, size_t size, void *mem_req); +__attribute__((weak)) void send_write_request_SST(unsigned core_id, + uint64_t address, size_t size, + void *mem_req) {} enum dram_ctrl_t { DRAM_FIFO = 0, DRAM_FRFCFS = 1 }; diff --git a/src/gpgpusim_entrypoint.cc b/src/gpgpusim_entrypoint.cc index 839fef619..e2b711ede 100644 --- a/src/gpgpusim_entrypoint.cc +++ b/src/gpgpusim_entrypoint.cc @@ -56,6 +56,7 @@ class stream_manager *g_stream_manager() { // SST callback extern void SST_callback_cudaThreadSynchronize_done(); +__attribute__((weak)) void SST_callback_cudaThreadSynchronize_done() {} void *gpgpu_sim_thread_sequential(void *ctx_ptr) { gpgpu_context *ctx = (gpgpu_context *)ctx_ptr; diff --git a/src/stream_manager.cc b/src/stream_manager.cc index b974791d0..58c2ec4b5 100644 --- a/src/stream_manager.cc +++ b/src/stream_manager.cc @@ -39,6 +39,10 @@ extern void SST_callback_memcpy_H2D_done(); extern void SST_callback_memcpy_D2H_done(); extern void SST_callback_memcpy_to_symbol_done(); extern void SST_callback_memcpy_from_symbol_done(); +__attribute__((weak)) void SST_callback_memcpy_H2D_done() {} +__attribute__((weak)) void SST_callback_memcpy_D2H_done() {} +__attribute__((weak)) void SST_callback_memcpy_to_symbol_done() {} +__attribute__((weak)) void SST_callback_memcpy_from_symbol_done() {} CUstream_st::CUstream_st() { m_pending = false; From 45caf76587642b7d5dcd8b0992c55114c9ced35e Mon Sep 17 00:00:00 2001 From: Aaron Barnes <42706182+barnes88@users.noreply.github.com> Date: Tue, 21 Jan 2025 17:22:56 -0600 Subject: [PATCH 153/154] move get_current_occupancy outside conditional (#83) --- src/gpgpu-sim/gpu-sim.cc | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/src/gpgpu-sim/gpu-sim.cc b/src/gpgpu-sim/gpu-sim.cc index b92494b43..55d70d115 100644 --- a/src/gpgpu-sim/gpu-sim.cc +++ b/src/gpgpu-sim/gpu-sim.cc @@ -2070,10 +2070,10 @@ void gpgpu_sim::cycle() { m_power_stats->pwr_mem_stat->n_mem_to_simt[CURRENT_STAT_IDX][i]); m_cluster[i]->get_cache_stats( m_power_stats->pwr_mem_stat->core_cache_stats[CURRENT_STAT_IDX]); - m_cluster[i]->get_current_occupancy( - gpu_occupancy.aggregate_warp_slot_filled, - gpu_occupancy.aggregate_theoretical_warp_slots); } + m_cluster[i]->get_current_occupancy( + gpu_occupancy.aggregate_warp_slot_filled, + gpu_occupancy.aggregate_theoretical_warp_slots); } float temp = 0; for (unsigned i = 0; i < m_shader_config->num_shader(); i++) { From 48af0c94ca6d7b05f800f535b3de4cefafcfa655 Mon Sep 17 00:00:00 2001 From: WilliamMTK Date: Wed, 29 Jan 2025 15:40:45 -0500 Subject: [PATCH 154/154] Add accelsim test (#82) * add_accelsim_test: add action and test script * add_accelsim_test: add error status to exit command --- .github/workflows/accelsim.yml | 33 ++++++++++++++++++++++++++++ short-tests-accelsim.sh | 39 ++++++++++++++++++++++++++++++++++ short-tests-cmake.sh | 6 +++--- short-tests.sh | 6 +++--- 4 files changed, 78 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/accelsim.yml create mode 100755 short-tests-accelsim.sh diff --git a/.github/workflows/accelsim.yml b/.github/workflows/accelsim.yml new file mode 100644 index 000000000..741035226 --- /dev/null +++ b/.github/workflows/accelsim.yml @@ -0,0 +1,33 @@ +# Test backend changes with Accel-Sim + +name: Short-Tests-AccelSim + +# Controls when the workflow will run +on: + push: + branches-ignore: + - "gh-readonly-queue**" + merge_group: + pull_request: + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +# By default regress against accel-sim's dev branch +env: + ACCELSIM_BRANCH: dev + +# A workflow run is made up of one or more jobs that can run 
sequentially or in parallel +jobs: + build-QV100: + runs-on: ubuntu-latest + container: + image: tgrogers/accel-sim_regress:Ubuntu-22.04-cuda-11.7 + + # Steps represent a sequence of tasks that will be executed as part of the job + steps: + # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it + - uses: actions/checkout@v4 + - name: Run Simulation + run: /bin/bash $GITHUB_WORKSPACE/short-tests-accelsim.sh + diff --git a/short-tests-accelsim.sh b/short-tests-accelsim.sh new file mode 100755 index 000000000..5cb4d2cc6 --- /dev/null +++ b/short-tests-accelsim.sh @@ -0,0 +1,39 @@ +if [ ! -n "$CUDA_INSTALL_PATH" ]; then + echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH."; + exit 1; +fi + +if [ ! -n "$ACCELSIM_BRANCH" ]; then + echo "ERROR ** set the ACCELSIM_BRANCH env variable"; + exit 1; +fi + +if [ ! -n "$GPUAPPS_ROOT" ]; then + echo "ERROR ** GPUAPPS_ROOT to a location where the apps have been compiled"; + exit 1; +fi + +git config --system --add safe.directory '*' + +export PATH=$CUDA_INSTALL_PATH/bin:$PATH +source ./setup_environment +make -j + +git clone https://github.com/accel-sim/accel-sim-framework.git + +# Build accel-sim +cd accel-sim-framework +git checkout $ACCELSIM_BRANCH +source ./gpu-simulator/setup_environment.sh +make -j -C ./gpu-simulator + +# Get rodinia traces +rm -rf ./hw_run/rodinia_2.0-ft +wget https://engineering.purdue.edu/tgrogers/accel-sim/traces/tesla-v100/latest/rodinia_2.0-ft.tgz +mkdir -p ./hw_run +tar -xzvf rodinia_2.0-ft.tgz -C ./hw_run +rm rodinia_2.0-ft.tgz + +# Run rodinia traces +./util/job_launching/run_simulations.py -C QV100-SASS -B rodinia_2.0-ft -T ./hw_run/rodinia_2.0-ft/9.1 -N myTest +./util/job_launching/monitor_func_test.py -v -N myTest diff --git a/short-tests-cmake.sh b/short-tests-cmake.sh index e41444156..23cf66c1a 100755 --- a/short-tests-cmake.sh +++ b/short-tests-cmake.sh @@ -1,16 +1,16 @@ if [ ! 
-n "$CUDA_INSTALL_PATH" ]; then echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH."; - exit; + exit 1; fi if [ ! -n "$CONFIG" ]; then echo "ERROR ** set the CONFIG env variable to one of those found in ./accel-sim-framework/util/job_launching/configs/define-standard-cfgs.yml"; - exit; + exit 1; fi if [ ! -n "$GPUAPPS_ROOT" ]; then echo "ERROR ** GPUAPPS_ROOT to a location where the apps have been compiled"; - exit; + exit 1; fi git config --system --add safe.directory '*' diff --git a/short-tests.sh b/short-tests.sh index 44f265a96..a1db76220 100755 --- a/short-tests.sh +++ b/short-tests.sh @@ -1,16 +1,16 @@ if [ ! -n "$CUDA_INSTALL_PATH" ]; then echo "ERROR ** Install CUDA Toolkit and set CUDA_INSTALL_PATH."; - exit; + exit 1; fi if [ ! -n "$CONFIG" ]; then echo "ERROR ** set the CONFIG env variable to one of those found in ./accel-sim-framework/util/job_launching/configs/define-standard-cfgs.yml"; - exit; + exit 1; fi if [ ! -n "$GPUAPPS_ROOT" ]; then echo "ERROR ** GPUAPPS_ROOT to a location where the apps have been compiled"; - exit; + exit 1; fi git config --system --add safe.directory '*'