From 1d10a975bada6b98c29822286fafb161f36335db Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 3 Feb 2018 13:12:25 -0500 Subject: [PATCH 001/127] Remove unused console.c --- src/platform/i386/Makefile | 1 - src/platform/i386/console.c | 152 ------------------------------------ src/platform/i386/kernel.c | 3 - src/platform/i386/kernel.h | 6 -- 4 files changed, 162 deletions(-) delete mode 100644 src/platform/i386/console.c diff --git a/src/platform/i386/Makefile b/src/platform/i386/Makefile index 9a4f0e0614..e6715d1f44 100644 --- a/src/platform/i386/Makefile +++ b/src/platform/i386/Makefile @@ -43,7 +43,6 @@ OBJS += hpet.o OBJS += chal.o OBJS += boot_comp.o OBJS += miniacpi.o -#OBJS += console.o OBJS += vga.o OBJS += exception.o OBJS += lapic.o diff --git a/src/platform/i386/console.c b/src/platform/i386/console.c deleted file mode 100644 index 7003a5b15b..0000000000 --- a/src/platform/i386/console.c +++ /dev/null @@ -1,152 +0,0 @@ -#define ENABLE_CONSOLE - -#include "io.h" -#include "string.h" -#include "isr.h" -#include "kernel.h" - -#define VIDEO_MEM 0xb8000 - -#define VGA_CTL_REG 0x3D4 -#define VGA_DATA_REG 0x3D5 - -#define KEY_DEVICE 0x60 -#define KEY_PENDING 0x64 - -#define COLUMNS 80 -#define LINES 25 - -/* FIXME these should go somewhere else */ -#define BACKSPACE 0x08 -#define TAB 0x09 - -enum vga_colors -{ - BLACK = 0x00, - BLUE, - GREEN, - CYAN, - RED, - MAGENTA, - BROWN, - LIGHT_GREY, - DARK_GREY, - LIGHT_BLUE, - LIGHT_GREEN, - LIGHT_CYAN, - LIGHT_RED, - LIGHT_MAGENTA, - LIGHT_BROWN, - WHITE -}; - -static u16_t *video_mem = (u16_t *)VIDEO_MEM; -static u8_t cursor_x; -static u8_t cursor_y; - -static void -wmemset(void *dst, int c, size_t count) -{ - unsigned short *tmp = (unsigned short *)dst; - - for (; count != 0; count--) *tmp++ = c; -} - -static inline u8_t -gen_color(u8_t forground, u8_t background) -{ - return (background << 4) | (forground & 0x0F); -} - -static void -update_cursor(u8_t row, u8_t col) -{ - u16_t pos = row * COLUMNS + col; - - outb(VGA_CTL_REG, 0x0E); - outb(VGA_DATA_REG, pos >> 8); - outb(VGA_CTL_REG, 0x0F); - outb(VGA_DATA_REG, pos); -} - -static void -scroll(void) -{ - u16_t blank = ((u8_t)' ') | gen_color(WHITE, BLACK); - unsigned i; - - if (cursor_y < LINES) return; - - for (i = 0; i < (LINES - 1) * COLUMNS; i++) video_mem[i] = video_mem[i + COLUMNS]; - - wmemset(video_mem + ((LINES - 1) * COLUMNS), blank, COLUMNS); - cursor_y = LINES - 1; -} - -static void -vga_putch(char c) -{ - u8_t color = gen_color(LIGHT_GREY, BLACK); - u16_t attribute = color << 8; - u16_t *location; - - if (c == BACKSPACE && cursor_x) - cursor_x--; - else if (c == TAB) - cursor_x = (cursor_x + 8) & ~(8 - 1); - else if (c == '\r') - cursor_x = 0; - else if (c == '\n') { - cursor_x = 0; - cursor_y++; - } else if (c >= ' ') { - location = video_mem + (cursor_y * COLUMNS + cursor_x); - *location = c | attribute; - cursor_x++; - } - - if (cursor_x >= COLUMNS) { - cursor_x = 0; - cursor_y++; - } - - scroll(); - update_cursor(cursor_y, cursor_x); -} - -void -vga_puts(const char *s) -{ - for (; *s != '\0'; s++) vga_putch(*s); -} - -void -vga_clear(void) -{ - u8_t color = gen_color(WHITE, BLACK); - u16_t blank = ((u8_t)' ') | color << 8; - wmemset(video_mem, blank, COLUMNS * LINES); -} - -int -keyboard_handler(struct pt_regs *regs) -{ - u16_t scancode; - int preempt = 1; - - ack_irq(IRQ_KEYBOARD); - - while (inb(KEY_PENDING) & 2) { - /* wait for keypress to be ready */ - } - scancode = inb(KEY_DEVICE); - printk("Keyboard press: %d\n", scancode); - return preempt; -} - -void 
-console_init(void) -{ - vga_clear(); - printk_register_handler(vga_puts); -} diff --git a/src/platform/i386/kernel.c b/src/platform/i386/kernel.c index c2d0eb3f3e..af21ded134 100644 --- a/src/platform/i386/kernel.c +++ b/src/platform/i386/kernel.c @@ -136,9 +136,6 @@ kmain(struct multiboot *mboot, u32_t mboot_magic, u32_t esp) #ifdef ENABLE_SERIAL serial_init(); -#endif -#ifdef ENABLE_CONSOLE - console_init(); #endif max = MAX((unsigned long)mboot->mods_addr, MAX((unsigned long)mboot->mmap_addr, (unsigned long)(chal_va2pa(&end)))); diff --git a/src/platform/i386/kernel.h b/src/platform/i386/kernel.h index aaf638e392..6f9f2e6e47 100644 --- a/src/platform/i386/kernel.h +++ b/src/platform/i386/kernel.h @@ -10,12 +10,6 @@ #include #include -#ifdef ENABLE_CONSOLE -void vga_clear(void); -void vga_puts(const char *s); -void console_init(void); -#endif - #ifdef ENABLE_VGA void vga_init(void); void vga_puts(const char *str); From fa6012ce54f53812cb0f6fbc6a829a9a7f81b15d Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 3 Feb 2018 14:54:30 -0500 Subject: [PATCH 002/127] Cleaning up namespace in platform/i386 --- src/pic.h | 7 ++ src/platform/i386/Makefile | 1 + src/platform/i386/chal/chal_config.h | 4 +- src/platform/i386/entry.S | 4 +- src/platform/i386/hpet.c | 127 ++++++++++++++------------- src/platform/i386/hpet.h | 14 +++ src/platform/i386/idt.c | 55 +----------- src/platform/i386/isr.h | 12 +-- src/platform/i386/kernel.c | 3 +- src/platform/i386/kernel.h | 43 ++------- src/platform/i386/lapic.h | 14 +++ src/platform/i386/miniacpi.c | 8 +- src/platform/i386/miniacpi.h | 9 ++ src/platform/i386/pic.c | 57 ++++++++++++ src/platform/i386/pic.h | 15 ++++ src/platform/i386/serial.c | 12 ++- src/platform/i386/serial.h | 8 ++ src/platform/i386/vga.c | 36 ++++---- src/platform/i386/vga.h | 8 ++ src/platform/i386/vm.c | 10 +-- 20 files changed, 246 insertions(+), 201 deletions(-) create mode 100644 src/pic.h create mode 100644 src/platform/i386/hpet.h create mode 100644 src/platform/i386/lapic.h create mode 100644 src/platform/i386/miniacpi.h create mode 100644 src/platform/i386/pic.c create mode 100644 src/platform/i386/pic.h create mode 100644 src/platform/i386/serial.h create mode 100644 src/platform/i386/vga.h diff --git a/src/pic.h b/src/pic.h new file mode 100644 index 0000000000..d092b841e8 --- /dev/null +++ b/src/pic.h @@ -0,0 +1,7 @@ +#ifndef PIC_H +#define PIC_H + +void pic_init(void); +void pic_ack_irq(int n); + +#endif /* PIC_H */ diff --git a/src/platform/i386/Makefile b/src/platform/i386/Makefile index e6715d1f44..a06bf9219f 100644 --- a/src/platform/i386/Makefile +++ b/src/platform/i386/Makefile @@ -32,6 +32,7 @@ CFLAGS += $(WARNINGS) OBJS += kernel.o OBJS += gdt.o OBJS += idt.o +OBJS += pic.o OBJS += vm.o OBJS += printk.o OBJS += string.o diff --git a/src/platform/i386/chal/chal_config.h b/src/platform/i386/chal/chal_config.h index 5b01cff05b..7302c37dd7 100644 --- a/src/platform/i386/chal/chal_config.h +++ b/src/platform/i386/chal/chal_config.h @@ -14,7 +14,7 @@ typedef signed long long s64_t; #endif typedef enum { - HW_PERIODIC = 32, /* periodic timer interrupt */ + HW_HPET_PERIODIC = 32, /* periodic timer interrupt */ HW_KEYBOARD, /* keyboard interrupt */ HW_ID3, HW_ID4, @@ -22,7 +22,7 @@ typedef enum { HW_ID6, HW_ID7, HW_ID8, - HW_ONESHOT, /* onetime timer interrupt */ + HW_HPET_ONESHOT, /* onetime timer interrupt */ HW_ID10, HW_ID11, HW_ID12, diff --git a/src/platform/i386/entry.S b/src/platform/i386/entry.S index 41320e8cce..7fa5dc08b7 100644 --- a/src/platform/i386/entry.S +++ 
b/src/platform/i386/entry.S @@ -131,7 +131,7 @@ IRQ(smid_float_pt_except_fault) IRQ(virtualization_except_fault) IRQ_CODE(security_except_fault) -IRQ(periodic) +IRQ(hpet_periodic) IRQ(keyboard) IRQ_ID(34) IRQ_ID(35) @@ -139,7 +139,7 @@ IRQ(serial) IRQ_ID(37) IRQ_ID(38) IRQ_ID(39) -IRQ(oneshot) +IRQ(hpet_oneshot) IRQ_ID(41) IRQ_ID(42) IRQ_ID(43) diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index 7ee06deeb9..17494bcca1 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -42,17 +42,17 @@ /* Bits in HPET_Tn_CONFIG */ /* 1 << 0 is reserved */ -#define TN_INT_TYPE_CNF (1ll << 1) /* 0 = edge trigger, 1 = level trigger */ -#define TN_INT_ENB_CNF (1ll << 2) /* 0 = no interrupt, 1 = interrupt */ -#define TN_TYPE_CNF (1ll << 3) /* 0 = one-shot, 1 = periodic */ -#define TN_PER_INT_CAP (1ll << 4) /* read only, 1 = periodic supported */ -#define TN_SIZE_CAP (1ll << 5) /* 0 = 32-bit, 1 = 64-bit */ -#define TN_VAL_SET_CNF (1ll << 6) /* set to allow directly setting accumulator */ +#define HPET_TN_INT_TYPE_CNF (1ll << 1) /* 0 = edge trigger, 1 = level trigger */ +#define HPET_TN_INT_ENB_CNF (1ll << 2) /* 0 = no interrupt, 1 = interrupt */ +#define HPET_TN_TYPE_CNF (1ll << 3) /* 0 = one-shot, 1 = periodic */ +#define HPET_TN_PER_INT_CAP (1ll << 4) /* read only, 1 = periodic supported */ +#define HPET_TN_SIZE_CAP (1ll << 5) /* 0 = 32-bit, 1 = 64-bit */ +#define HPET_TN_VAL_SET_CNF (1ll << 6) /* set to allow directly setting accumulator */ /* 1 << 7 is reserved */ -#define TN_32MODE_CNF (1ll << 8) /* 1 = force 32-bit access to 64-bit timer */ -/* #define TN_INT_ROUTE_CNF (1<<9:1<<13)*/ /* routing for interrupt */ -#define TN_FSB_EN_CNF (1ll << 14) /* 1 = deliver interrupts via FSB instead of APIC */ -#define TN_FSB_INT_DEL_CAP (1ll << 15) /* read only, 1 = FSB delivery available */ +#define HPET_TN_32MODE_CNF (1ll << 8) /* 1 = force 32-bit access to 64-bit timer */ +/* #define HPET_TN_INT_ROUTE_CNF (1<<9:1<<13)*/ /* routing for interrupt */ +#define HPET_TN_FSB_EN_CNF (1ll << 14) /* 1 = deliver interrupts via FSB instead of APIC */ +#define HPET_TN_FSB_INT_DEL_CAP (1ll << 15) /* read only, 1 = FSB delivery available */ #define HPET_INT_ENABLE(n) (*hpet_interrupt = (0x1 << n)) /* Clears the INT n for level-triggered mode. */ @@ -70,7 +70,7 @@ volatile struct hpet_timer { /* * When determining how many CPU cycles are in a HPET tick, we must - * execute a number of periodic ticks (TIMER_CALIBRATION_ITER) at a + * execute a number of periodic ticks (HPET_CALIBRATION_ITER) at a * controlled interval, and use the HPET tick granularity to compute * how many CPU cycles per HPET tick there are. Unfortunately, this * can be quite low (e.g. HPET tick of 10ns, CPU tick of 2ns) leading @@ -80,32 +80,33 @@ volatile struct hpet_timer { * Practically, this will lead to the divisor in the conversion being * smaller than it should be, thus causing timers to go off _later_ * than they should. Thus we use a multiplicative factor - * (TIMER_ERROR_BOUND_FACTOR) to lessen the rounding error. + * (HPET_ERROR_BOUND_FACTOR) to lessen the rounding error. 
* * All of the hardware is documented in the HPET specification @ * http://www.intel.com/content/dam/www/public/us/en/documents/technical-specifications/software-developers-hpet-spec-1-0a.pdf */ -#define PICO_PER_MICRO 1000000UL -#define FEMPTO_PER_PICO 1000UL -#define TIMER_CALIBRATION_ITER 256 -#define TIMER_ERROR_BOUND_FACTOR 256 -static int timer_calibration_init = 1; -static unsigned long timer_cycles_per_hpetcyc = TIMER_ERROR_BOUND_FACTOR; -static unsigned long cycles_per_tick; -static unsigned long hpetcyc_per_tick; #define ULONG_MAX 4294967295UL +#define HPET_PICO_PER_MICRO 1000000UL +#define HPET_FEMPTO_PER_PICO 1000UL +#define HPET_CALIBRATION_ITER 256 +#define HPET_ERROR_BOUND_FACTOR 256 +#define HPET_DEFAULT_PERIOD_US 1000 /* US = microseconds */ +static int hpet_calibration_init = 1; +static unsigned long hpet_cpucyc_per_hpetcyc = HPET_ERROR_BOUND_FACTOR; +static unsigned long hpet_cpucyc_per_tick; +static unsigned long hpet_hpetcyc_per_tick; static inline u64_t -timer_cpu2hpet_cycles(u64_t cycles) +hpet_cpu2hpet_cycles(u64_t cycles) { unsigned long cyc; /* demote precision to enable word-sized math */ cyc = (unsigned long)cycles; - if (unlikely((u64_t)cyc < cycles)) cyc= ULONG_MAX; + if (unlikely((u64_t)cyc < cycles)) cyc = ULONG_MAX; /* convert from CPU cycles to HPET cycles */ - cyc = (cyc / timer_cycles_per_hpetcyc) * TIMER_ERROR_BOUND_FACTOR; + cyc = (cyc / hpet_cpucyc_per_hpetcyc) * HPET_ERROR_BOUND_FACTOR; /* promote the precision to interact with the hardware correctly */ cycles = cyc; @@ -113,7 +114,7 @@ timer_cpu2hpet_cycles(u64_t cycles) } static void -timer_disable(timer_type_t timer_type) +hpet_disable(hpet_type_t timer_type) { /* Disable timer interrupts */ *hpet_config &= ~HPET_ENABLE_CNF; @@ -127,10 +128,10 @@ timer_disable(timer_type_t timer_type) } static void -timer_calibration(void) +hpet_calibration(void) { - static int cnt = 0; - static u64_t cycle = 0, tot = 0, prev; + static int cnt = 0; + static u64_t cycle = 0, tot = 0, prev; static u32_t apic_curr = 0, apic_tot = 0, apic_prev; prev = cycle; @@ -142,30 +143,30 @@ timer_calibration(void) tot += cycle - prev; apic_tot += (apic_prev - apic_curr); } - if (cnt >= TIMER_CALIBRATION_ITER) { - assert(hpetcyc_per_tick); - timer_calibration_init = 0; - cycles_per_tick = (unsigned long)(tot / TIMER_CALIBRATION_ITER); - assert(cycles_per_tick > hpetcyc_per_tick); + if (cnt >= HPET_CALIBRATION_ITER) { + assert(hpet_hpetcyc_per_tick); + hpet_calibration_init = 0; + hpet_cpucyc_per_tick = (unsigned long)(tot / HPET_CALIBRATION_ITER); + assert(hpet_cpucyc_per_tick > hpet_hpetcyc_per_tick); if (lapic_timer_calib_init) { u32_t cycs_to_apic_ratio = 0, apic_cycs_per_tick = 0; - apic_cycs_per_tick = apic_tot / TIMER_CALIBRATION_ITER; + apic_cycs_per_tick = apic_tot / HPET_CALIBRATION_ITER; assert(apic_cycs_per_tick); - cycs_to_apic_ratio = cycles_per_tick / apic_cycs_per_tick; + cycs_to_apic_ratio = hpet_cpucyc_per_tick / apic_cycs_per_tick; lapic_timer_calibration(cycs_to_apic_ratio); } /* Possibly significant rounding error here. 
Bound by the factor */ - timer_cycles_per_hpetcyc = (TIMER_ERROR_BOUND_FACTOR * cycles_per_tick) / hpetcyc_per_tick; + hpet_cpucyc_per_hpetcyc = (HPET_ERROR_BOUND_FACTOR * hpet_cpucyc_per_tick) / hpet_hpetcyc_per_tick; printk("Timer calibrated:\n\tCPU cycles per HPET tick: %ld\n\tHPET ticks in %d us: %ld\n", - timer_cycles_per_hpetcyc / TIMER_ERROR_BOUND_FACTOR, TIMER_DEFAULT_US_INTERARRIVAL, - hpetcyc_per_tick); + hpet_cpucyc_per_hpetcyc / HPET_ERROR_BOUND_FACTOR, HPET_DEFAULT_PERIOD_US, + hpet_hpetcyc_per_tick); - timer_disable(TIMER_PERIODIC); - timer_disable(TIMER_PERIODIC); + hpet_disable(HPET_PERIODIC); + hpet_disable(HPET_PERIODIC); } cnt++; } @@ -173,55 +174,55 @@ timer_calibration(void) int chal_cyc_usec(void) { - return cycles_per_tick / TIMER_DEFAULT_US_INTERARRIVAL; + return hpet_cpucyc_per_tick / HPET_DEFAULT_PERIOD_US; } int -periodic_handler(struct pt_regs *regs) +hpet_periodic_handler(struct pt_regs *regs) { int preempt = 1; - if (unlikely(timer_calibration_init)) timer_calibration(); + if (unlikely(hpet_calibration_init)) hpet_calibration(); - ack_irq(HW_PERIODIC); - preempt = cap_hw_asnd(&hw_asnd_caps[HW_PERIODIC], regs); - HPET_INT_ENABLE(TIMER_PERIODIC); + pic_ack_irq(HW_HPET_PERIODIC); + preempt = cap_hw_asnd(&hw_asnd_caps[HW_HPET_PERIODIC], regs); + HPET_INT_ENABLE(HPET_PERIODIC); return preempt; } -extern int timer_process(struct pt_regs *regs); - int -oneshot_handler(struct pt_regs *regs) +hpet_oneshot_handler(struct pt_regs *regs) { int preempt = 1; - ack_irq(HW_ONESHOT); - preempt = timer_process(regs); - HPET_INT_ENABLE(TIMER_ONESHOT); + assert(!hpet_calibration_init); + + pic_ack_irq(HW_HPET_ONESHOT); + preempt = cap_hw_asnd(&hw_asnd_caps[HW_HPET_ONESHOT], regs); + HPET_INT_ENABLE(HPET_ONESHOT); return preempt; } void -timer_set(timer_type_t timer_type, u64_t cycles) +hpet_set(hpet_type_t timer_type, u64_t cycles) { - u64_t outconfig = TN_INT_TYPE_CNF | TN_INT_ENB_CNF; + u64_t outconfig = HPET_TN_INT_TYPE_CNF | HPET_TN_INT_ENB_CNF; /* Disable timer interrupts */ *hpet_config &= ~HPET_ENABLE_CNF; /* Reset main counter */ - if (timer_type == TIMER_ONESHOT) { - cycles = timer_cpu2hpet_cycles(cycles); + if (timer_type == HPET_ONESHOT) { + cycles = hpet_cpu2hpet_cycles(cycles); /* Set a static value to count up to */ hpet_timers[timer_type].config = outconfig; cycles += HPET_COUNTER; } else { /* Set a periodic value */ - hpet_timers[timer_type].config = outconfig | TN_TYPE_CNF | TN_VAL_SET_CNF; + hpet_timers[timer_type].config = outconfig | HPET_TN_TYPE_CNF | HPET_TN_VAL_SET_CNF; /* Reset main counter */ HPET_COUNTER = 0x00; } @@ -232,7 +233,7 @@ timer_set(timer_type_t timer_type, u64_t cycles) } u64_t -timer_find_hpet(void *timer) +hpet_find(void *timer) { u32_t i; unsigned char sum = 0; @@ -259,7 +260,7 @@ timer_find_hpet(void *timer) } void -timer_set_hpet_page(u32_t page) +hpet_set_page(u32_t page) { hpet = (void *)(page * (1 << 22) | ((u32_t)hpet & ((1 << 22) - 1))); hpet_capabilities = (u32_t *)((unsigned char *)hpet + HPET_CAPABILITIES); @@ -271,14 +272,14 @@ timer_set_hpet_page(u32_t page) } void -timer_init(void) +hpet_init(void) { unsigned long pico_per_hpetcyc; assert(hpet_capabilities); - pico_per_hpetcyc = hpet_capabilities[1] - / FEMPTO_PER_PICO; /* bits 32-63 are # of femptoseconds per HPET clock tick */ - hpetcyc_per_tick = (TIMER_DEFAULT_US_INTERARRIVAL * PICO_PER_MICRO) / pico_per_hpetcyc; + /* bits 32-63 are # of femptoseconds per HPET clock tick */ + pico_per_hpetcyc = hpet_capabilities[1] / HPET_FEMPTO_PER_PICO; + hpet_hpetcyc_per_tick = 
(HPET_DEFAULT_PERIOD_US * HPET_PICO_PER_MICRO) / pico_per_hpetcyc; printk("Enabling timer @ %p with tick granularity %ld picoseconds\n", hpet, pico_per_hpetcyc); /* Enable legacy interrupt routing */ @@ -288,5 +289,5 @@ timer_init(void) * Set the timer as specified. This assumes that the cycle * specification is in hpet cycles (not cpu cycles). */ - timer_set(TIMER_PERIODIC, hpetcyc_per_tick); + hpet_set(HPET_PERIODIC, hpet_hpetcyc_per_tick); } diff --git a/src/platform/i386/hpet.h b/src/platform/i386/hpet.h new file mode 100644 index 0000000000..f6aa186ce8 --- /dev/null +++ b/src/platform/i386/hpet.h @@ -0,0 +1,14 @@ +#ifndef HPET_H +#define HPET_H + +typedef enum { + HPET_PERIODIC = 0, + HPET_ONESHOT = 1, +} hpet_type_t; + +void hpet_set(hpet_type_t timer_type, u64_t cycles); +void hpet_init(void); +u64_t hpet_find(void *timer); +void hpet_set_page(u32_t page); + +#endif /* HPET_H */ diff --git a/src/platform/i386/idt.c b/src/platform/i386/idt.c index d9a9107f00..e6f021aaa3 100644 --- a/src/platform/i386/idt.c +++ b/src/platform/i386/idt.c @@ -3,31 +3,6 @@ #include "isr.h" #include "io.h" -/* Information taken from: http://wiki.osdev.org/PIC */ -/* FIXME: Remove magic numbers and replace with this */ -#define PIC1 0x20 -#define PIC2 0xA0 -#define PIC1_COMMAND PIC1 -#define PIC1_DATA (PIC1 + 1) -#define PIC2_COMMAND PIC2 -#define PIC2_DATA (PIC2 + 1) - -/* reinitialize the PIC controllers, giving them specified vector offsets - rather than 8 and 70, as configured by default */ - -#define ICW1_ICW4 0x01 /* ICW4 (not) needed */ -#define ICW1_SINGLE 0x02 /* Single (cascade) mode */ -#define ICW1_INTERVAL4 0x04 /* Call address interval 4 (8) */ -#define ICW1_LEVEL 0x08 /* Level triggered (edge) mode */ -#define ICW1_INIT 0x10 /* Initialization - required! */ - -#define ICW4_8086 0x01 /* 8086/88 (MCS-80/85) mode */ -#define ICW4_AUTO 0x02 /* Auto (normal) EOI */ -#define ICW4_BUF_SLAVE 0x08 /* Buffered mode/slave */ -#define ICW4_BUF_MASTER 0x0C /* Buffered mode/master */ -#define ICW4_SFNM 0x10 /* Special fully nested (not) */ -#define ICW1_ICW4 0x01 - struct idt_entry { u16_t base_lo; // Lower 16 bits of address to jump too after int u16_t sel; // Kernel segment selector @@ -72,25 +47,12 @@ hw_handler(struct pt_regs *regs) * TODO: ack here? or * after user-level interrupt(rcv event) processing? 
*/ - ack_irq(regs->orig_ax); + pic_ack_irq(regs->orig_ax); preempt = cap_hw_asnd(&hw_asnd_caps[regs->orig_ax], regs); return preempt; } -#if 0 -static inline void -remap_irq_table(void) -{ - u8_t pic1_mask; - u8_t pic2_mask; - - // Save masks - pic1_mask = inb(PIC1_DATA); - pic2_mask = inb(PIC2_DATA); -} -#endif - void idt_init(void) { @@ -98,17 +60,6 @@ idt_init(void) idt_ptr.base = (u32_t)&idt_entries; memset(&idt_entries, 0, sizeof(struct idt_entry) * NUM_IDT_ENTRIES); - outb(0x20, 0x11); - outb(0xA0, 0x11); - outb(0x21, 0x20); - outb(0xA1, 0x28); - outb(0x21, 0x04); - outb(0xA1, 0x02); - outb(0x21, 0x01); - outb(0xA1, 0x01); - outb(0x21, 0x0); - outb(0xA1, 0x0); - idt_set_gate(IRQ_DIV_BY_ZERO_ERR_FAULT, (u32_t)div_by_zero_err_fault_irq, 0x08, 0x8E); idt_set_gate(IRQ_DEBUG_TRAP, (u32_t)debug_trap_irq, 0x08, 0x8E); idt_set_gate(IRQ_BREAKPOINT_TRAP, (u32_t)breakpoint_trap_irq, 0x08, 0x8E); @@ -129,7 +80,7 @@ idt_init(void) idt_set_gate(IRQ_VIRTUALIZATION_EXCEPT_FAULT, (u32_t)virtualization_except_fault_irq, 0x08, 0x8E); idt_set_gate(IRQ_SECURITY_EXCEPT_FAULT, (u32_t)security_except_fault_irq, 0x08, 0x8E); - idt_set_gate(HW_PERIODIC, (u32_t)periodic_irq, 0x08, 0x8E); + idt_set_gate(HW_HPET_PERIODIC, (u32_t)hpet_periodic_irq, 0x08, 0x8E); idt_set_gate(HW_KEYBOARD, (u32_t)keyboard_irq, 0x08, 0x8E); idt_set_gate(HW_ID3, (u32_t)handler_hw_34, 0x08, 0x8E); idt_set_gate(HW_ID4, (u32_t)handler_hw_35, 0x08, 0x8E); @@ -137,7 +88,7 @@ idt_init(void) idt_set_gate(HW_ID6, (u32_t)handler_hw_37, 0x08, 0x8E); idt_set_gate(HW_ID7, (u32_t)handler_hw_38, 0x08, 0x8E); idt_set_gate(HW_ID8, (u32_t)handler_hw_39, 0x08, 0x8E); - idt_set_gate(HW_ONESHOT, (u32_t)oneshot_irq, 0x08, 0x8E); + idt_set_gate(HW_HPET_ONESHOT, (u32_t)hpet_oneshot_irq, 0x08, 0x8E); idt_set_gate(HW_ID10, (u32_t)handler_hw_41, 0x08, 0x8E); idt_set_gate(HW_ID11, (u32_t)handler_hw_42, 0x08, 0x8E); idt_set_gate(HW_ID12, (u32_t)handler_hw_43, 0x08, 0x8E); diff --git a/src/platform/i386/isr.h b/src/platform/i386/isr.h index f086fda957..9a36a16f07 100644 --- a/src/platform/i386/isr.h +++ b/src/platform/i386/isr.h @@ -2,7 +2,6 @@ #define ISR_H #include "shared/cos_types.h" -#include "io.h" #include "chal_asm_inc.h" #include @@ -49,7 +48,7 @@ extern void smid_float_pt_except_fault_irq(struct pt_regs *); extern void virtualization_except_fault_irq(struct pt_regs *); extern void security_except_fault_irq(struct pt_regs *); -extern void periodic_irq(struct pt_regs *); +extern void hpet_periodic_irq(struct pt_regs *); extern void keyboard_irq(struct pt_regs *); extern void handler_hw_34(struct pt_regs *); extern void handler_hw_35(struct pt_regs *); @@ -57,7 +56,7 @@ extern void serial_irq(struct pt_regs *); extern void handler_hw_37(struct pt_regs *); extern void handler_hw_38(struct pt_regs *); extern void handler_hw_39(struct pt_regs *); -extern void oneshot_irq(struct pt_regs *); +extern void hpet_oneshot_irq(struct pt_regs *); extern void handler_hw_41(struct pt_regs *); extern void handler_hw_42(struct pt_regs *); extern void handler_hw_43(struct pt_regs *); @@ -83,11 +82,4 @@ extern void handler_hw_62(struct pt_regs *); extern void handler_hw_63(struct pt_regs *); extern void lapic_timer_irq(struct pt_regs *); -static void -ack_irq(int n) -{ - if (n >= 40) outb(0xA0, 0x20); /* Send reset signal to slave */ - outb(0x20, 0x20); -} - #endif /* ISR_H */ diff --git a/src/platform/i386/kernel.c b/src/platform/i386/kernel.c index af21ded134..c6b2d9492c 100644 --- a/src/platform/i386/kernel.c +++ b/src/platform/i386/kernel.c @@ -155,8 +155,9 @@ 
kmain(struct multiboot *mboot, u32_t mboot_magic, u32_t esp) #endif kern_boot_comp(); smp_init(); - timer_init(); + hpet_init(); lapic_timer_init(); + pic_init(); kern_boot_upcall(); /* should not get here... */ khalt(); diff --git a/src/platform/i386/kernel.h b/src/platform/i386/kernel.h index 6f9f2e6e47..f68b943465 100644 --- a/src/platform/i386/kernel.h +++ b/src/platform/i386/kernel.h @@ -10,53 +10,22 @@ #include #include -#ifdef ENABLE_VGA -void vga_init(void); -void vga_puts(const char *str); -#endif - -#ifdef ENABLE_SERIAL -void serial_init(void); -#endif - -/* These numbers map directly to actual timers in the HPET */ -typedef enum { - TIMER_PERIODIC = 0, - TIMER_ONESHOT = 1, -} timer_type_t; - -#define TIMER_DEFAULT_US_INTERARRIVAL 1000 /* US = microseconds */ - -void timer_set(timer_type_t timer_type, u64_t cycles); -void timer_init(void); -u64_t timer_find_hpet(void *timer); -void timer_set_hpet_page(u32_t page); -void timer_thd_init(struct thread *t); +#include "vga.h" +#include "serial.h" +#include "hpet.h" +#include "miniacpi.h" +#include "lapic.h" +#include "pic.h" void tss_init(void); void idt_init(void); void gdt_init(void); void user_init(void); void paging_init(void); -void *acpi_find_rsdt(void); -void *acpi_find_timer(void); -void acpi_set_rsdt_page(u32_t); void kern_paging_map_init(void *pa); -void * acpi_find_apic(void); -u32_t lapic_find_localaddr(void *l); -void lapic_set_page(u32_t page); -void lapic_timer_init(void); -void lapic_set_timer(int timer_type, cycles_t deadline); -u32_t lapic_get_ccr(void); -void lapic_timer_calibration(u32_t ratio); -extern u32_t lapic_timer_calib_init; - -void smp_init(void); - void tls_update(u32_t addr); -// void printk(const char *fmt, ...); int printk_register_handler(void (*handler)(const char *)); void khalt(void); diff --git a/src/platform/i386/lapic.h b/src/platform/i386/lapic.h new file mode 100644 index 0000000000..5ea4841b66 --- /dev/null +++ b/src/platform/i386/lapic.h @@ -0,0 +1,14 @@ +#ifndef LAPIC_H +#define LAPIC_H + +u32_t lapic_find_localaddr(void *l); +void lapic_set_page(u32_t page); +void lapic_timer_init(void); +void lapic_set_timer(int timer_type, cycles_t deadline); +u32_t lapic_get_ccr(void); +void lapic_timer_calibration(u32_t ratio); +extern u32_t lapic_timer_calib_init; + +void smp_init(void); + +#endif /* LAPIC_H */ diff --git a/src/platform/i386/miniacpi.c b/src/platform/i386/miniacpi.c index c1647cfd25..68762dd291 100644 --- a/src/platform/i386/miniacpi.c +++ b/src/platform/i386/miniacpi.c @@ -43,7 +43,7 @@ pa2va(void *pa) } void * -acpi_find_rsdt(void) +miniacpi_find_rsdt(void) { unsigned char *sig; struct rsdp * rsdp = NULL; @@ -78,7 +78,7 @@ acpi_find_rsdt(void) } void * -acpi_find_timer(void) +miniacpi_find_hpet(void) { pgtbl_t pgtbl = (pgtbl_t)boot_comp_pgd; size_t i; @@ -108,7 +108,7 @@ acpi_find_timer(void) } void * -acpi_find_apic(void) +miniacpi_find_apic(void) { size_t i; @@ -137,7 +137,7 @@ acpi_find_apic(void) } void -acpi_set_rsdt_page(u32_t page) +miniacpi_set_rsdt_page(u32_t page) { basepage = page * (1 << 22); rsdt = (struct rsdt *)pa2va(rsdt); diff --git a/src/platform/i386/miniacpi.h b/src/platform/i386/miniacpi.h new file mode 100644 index 0000000000..2c42b3f1f4 --- /dev/null +++ b/src/platform/i386/miniacpi.h @@ -0,0 +1,9 @@ +#ifndef MINIACPI_H +#define MINIACPI_H + +void *miniacpi_find_apic(void); +void *miniacpi_find_rsdt(void); +void *miniacpi_find_hpet(void); +void miniacpi_set_rsdt_page(u32_t); + +#endif /* MINIACPI_H */ diff --git a/src/platform/i386/pic.c 
b/src/platform/i386/pic.c new file mode 100644 index 0000000000..a7cfa7eb5c --- /dev/null +++ b/src/platform/i386/pic.c @@ -0,0 +1,57 @@ +#include "pic.h" + +#define PIC_IRQ_BASE 0x20 +#define PIC_ALL_DISABLE 0xFF +#define PIC_ALL_ENABLE 0x00 + +/* Information taken from: http://wiki.osdev.org/PIC */ +#define PIC1 0x20 +#define PIC2 0xA0 +#define PIC1_CMD PIC1 +#define PIC1_DATA (PIC1 + 1) +#define PIC2_CMD PIC2 +#define PIC2_DATA (PIC2 + 1) + +/* reinitialize the PIC controllers, giving them specified vector offsets + rather than 8 and 70, as configured by default */ +#define PIC_ICW1_ICW4 0x01 /* ICW4 (not) needed */ +#define PIC_ICW1_SINGLE 0x02 /* Single (cascade) mode */ +#define PIC_ICW1_INTERVAL4 0x04 /* Call address interval 4 (8) */ +#define PIC_ICW1_LEVEL 0x08 /* Level triggered (edge) mode */ +#define PIC_ICW1_INIT 0x10 /* Initialization - required! */ + +#define PIC_ICW4_8086 0x01 /* 8086/88 (MCS-80/85) mode */ +#define PIC_ICW4_AUTO 0x02 /* Auto (normal) EOI */ +#define PIC_ICW4_BUF_SLAVE 0x08 /* Buffered mode/slave */ +#define PIC_ICW4_BUF_MASTER 0x0C /* Buffered mode/master */ +#define PIC_ICW4_SFNM 0x10 /* Special fully nested (not) */ +#define PIC_ICW1_ICW4 0x01 + +static void +pic_disable(void) +{ + outb(PIC1_DATA, PIC_ALL_DISABLE); + outb(PIC2_DATA, PIC_ALL_DISABLE); +} + +static void +pic_enable(void) +{ + outb(PIC1_DATA, PIC_ALL_ENABLE); + outb(PIC2_DATA, PIC_ALL_ENABLE); +} + +void +pic_init(void) +{ + outb(PIC1_CMD, PIC_ICW1_INIT | PIC_ICW1_ICW4); + outb(PIC2_CMD, PIC_ICW1_INIT | PIC_ICW1_ICW4); + outb(PIC1_DATA, PIC_IRQ_BASE); + outb(PIC2_DATA, PIC_IRQ_BASE + 8); + outb(PIC1_DATA, 4); + outb(PIC2_DATA, 2); + outb(PIC1_DATA, PIC_ICW4_8086); + outb(PIC2_DATA, PIC_ICW4_8086); + + pic_enable(); +} diff --git a/src/platform/i386/pic.h b/src/platform/i386/pic.h new file mode 100644 index 0000000000..55f604fc38 --- /dev/null +++ b/src/platform/i386/pic.h @@ -0,0 +1,15 @@ +#ifndef PIC_H +#define PIC_H + +#include "io.h" + +void pic_init(void); + +static void +pic_ack_irq(int n) +{ + if (n >= 40) outb(0xA0, 0x20); /* Send reset signal to slave */ + outb(0x20, 0x20); +} + +#endif /* PIC_H */ diff --git a/src/platform/i386/serial.c b/src/platform/i386/serial.c index 1c1bcf82f6..6c867b8f92 100644 --- a/src/platform/i386/serial.c +++ b/src/platform/i386/serial.c @@ -5,8 +5,6 @@ #include "isr.h" #include "kernel.h" -void serial_puts(const char *s); - enum serial_ports { SERIAL_PORT_A = 0x3F8, @@ -43,7 +41,7 @@ serial_handler(struct pt_regs *r) char serial; int preempt = 1; - ack_irq(HW_SERIAL); + pic_ack_irq(HW_SERIAL); serial = serial_recv(); @@ -62,12 +60,12 @@ serial_handler(struct pt_regs *r) case 3: /* FIXME: Obviously remove this once we have working components */ die("Break\n"); case 'o': - timer_set(TIMER_ONESHOT, 50000000); - timer_set(TIMER_ONESHOT, 50000000); + hpet_set(HPET_ONESHOT, 50000000); + hpet_set(HPET_ONESHOT, 50000000); break; case 'p': - timer_set(TIMER_PERIODIC, 100000000); - timer_set(TIMER_PERIODIC, 100000000); + hpet_set(HPET_PERIODIC, 100000000); + hpet_set(HPET_PERIODIC, 100000000); break; default: break; diff --git a/src/platform/i386/serial.h b/src/platform/i386/serial.h new file mode 100644 index 0000000000..bc8461644d --- /dev/null +++ b/src/platform/i386/serial.h @@ -0,0 +1,8 @@ +#ifndef SERIAL_H +#define SERIAL_H + +#ifdef ENABLE_SERIAL +void serial_init(void); +#endif + +#endif diff --git a/src/platform/i386/vga.c b/src/platform/i386/vga.c index 298e228b57..991b73f60a 100644 --- a/src/platform/i386/vga.c +++ b/src/platform/i386/vga.c @@ -160,23 
+160,6 @@ cls(void) move_csr(); } -/* - * Clear the screen and initialize VIDEO, XPOS and YPOS. - * VIDEO virtual address set to HIGH address. - */ -void -vga_init(void) -{ - int i = 0; - - video = chal_pa2va(VIDEO); - - csr_x = 0; - csr_y = 0; - cls(); - printk_register_handler(vga_puts); -} - /* Put the character C on the screen. */ static void putchar(int c) @@ -218,12 +201,29 @@ puts(unsigned char *text) move_csr(); } +/* + * Clear the screen and initialize VIDEO, XPOS and YPOS. + * VIDEO virtual address set to HIGH address. + */ +void +vga_init(void) +{ + int i = 0; + + video = chal_pa2va(VIDEO); + + csr_x = 0; + csr_y = 0; + cls(); + printk_register_handler(vga_puts); +} + void keyboard_handler(struct pt_regs *regs) { u16_t scancode = 0; - ack_irq(HW_KEYBOARD); + pic_ack_irq(HW_KEYBOARD); while (inb(KEY_PENDING) & 2) { /* wait for keypress to be ready */ diff --git a/src/platform/i386/vga.h b/src/platform/i386/vga.h new file mode 100644 index 0000000000..1da89ee4ac --- /dev/null +++ b/src/platform/i386/vga.h @@ -0,0 +1,8 @@ +#ifndef VGA_H +#define VGA_H + +#ifdef ENABLE_VGA +void vga_init(void); +#endif + +#endif /* VGA_H */ diff --git a/src/platform/i386/vm.c b/src/platform/i386/vm.c index f01c8950c5..99630b024d 100644 --- a/src/platform/i386/vm.c +++ b/src/platform/i386/vm.c @@ -74,26 +74,26 @@ kern_setup_image(void) /* FIXME: Ugly hack to get the physical page with the ACPI RSDT mapped */ printk("ACPI initialization\n"); - void *rsdt = acpi_find_rsdt(); + void *rsdt = miniacpi_find_rsdt(); if (rsdt) { u32_t lapic, page; u64_t hpet; page = round_up_to_pgd_page(rsdt) - (1 << 22); boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; - acpi_set_rsdt_page(j); + miniacpi_set_rsdt_page(j); j++; - hpet = timer_find_hpet(acpi_find_timer()); + hpet = hpet_find(miniacpi_find_hpet()); if (hpet) { page = round_up_to_pgd_page(hpet & 0xffffffff) - (1 << 22); boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; - timer_set_hpet_page(j); + hpet_set_page(j); j++; } /* lapic memory map */ - lapic = lapic_find_localaddr(acpi_find_apic()); + lapic = lapic_find_localaddr(miniacpi_find_apic()); if (lapic) { page = round_up_to_pgd_page(lapic & 0xffffffff) - (1 << 22); boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; From 83503b37727bd1b690e9f457d2e621cb6d6e86df Mon Sep 17 00:00:00 2001 From: phani Date: Sun, 4 Feb 2018 12:02:48 -0500 Subject: [PATCH 003/127] rename miniacpi to acpi. 
--- src/platform/i386/Makefile | 2 +- src/platform/i386/{miniacpi.c => acpi.c} | 8 ++++---- src/platform/i386/acpi.h | 9 +++++++++ src/platform/i386/kernel.h | 2 +- src/platform/i386/miniacpi.h | 9 --------- src/platform/i386/vm.c | 8 ++++---- 6 files changed, 19 insertions(+), 19 deletions(-) rename src/platform/i386/{miniacpi.c => acpi.c} (96%) create mode 100644 src/platform/i386/acpi.h delete mode 100644 src/platform/i386/miniacpi.h diff --git a/src/platform/i386/Makefile b/src/platform/i386/Makefile index a06bf9219f..31874fb459 100644 --- a/src/platform/i386/Makefile +++ b/src/platform/i386/Makefile @@ -43,7 +43,7 @@ OBJS += serial.o OBJS += hpet.o OBJS += chal.o OBJS += boot_comp.o -OBJS += miniacpi.o +OBJS += acpi.o OBJS += vga.o OBJS += exception.o OBJS += lapic.o diff --git a/src/platform/i386/miniacpi.c b/src/platform/i386/acpi.c similarity index 96% rename from src/platform/i386/miniacpi.c rename to src/platform/i386/acpi.c index 68762dd291..72273abc75 100644 --- a/src/platform/i386/miniacpi.c +++ b/src/platform/i386/acpi.c @@ -43,7 +43,7 @@ pa2va(void *pa) } void * -miniacpi_find_rsdt(void) +acpi_find_rsdt(void) { unsigned char *sig; struct rsdp * rsdp = NULL; @@ -78,7 +78,7 @@ miniacpi_find_rsdt(void) } void * -miniacpi_find_hpet(void) +acpi_find_hpet(void) { pgtbl_t pgtbl = (pgtbl_t)boot_comp_pgd; size_t i; @@ -108,7 +108,7 @@ miniacpi_find_hpet(void) } void * -miniacpi_find_apic(void) +acpi_find_apic(void) { size_t i; @@ -137,7 +137,7 @@ miniacpi_find_apic(void) } void -miniacpi_set_rsdt_page(u32_t page) +acpi_set_rsdt_page(u32_t page) { basepage = page * (1 << 22); rsdt = (struct rsdt *)pa2va(rsdt); diff --git a/src/platform/i386/acpi.h b/src/platform/i386/acpi.h new file mode 100644 index 0000000000..f7eb3cb9e6 --- /dev/null +++ b/src/platform/i386/acpi.h @@ -0,0 +1,9 @@ +#ifndef ACPI_H +#define ACPI_H + +void *acpi_find_apic(void); +void *acpi_find_rsdt(void); +void *acpi_find_hpet(void); +void acpi_set_rsdt_page(u32_t); + +#endif /* ACPI_H */ diff --git a/src/platform/i386/kernel.h b/src/platform/i386/kernel.h index f68b943465..0ba6c6ae0f 100644 --- a/src/platform/i386/kernel.h +++ b/src/platform/i386/kernel.h @@ -13,7 +13,7 @@ #include "vga.h" #include "serial.h" #include "hpet.h" -#include "miniacpi.h" +#include "acpi.h" #include "lapic.h" #include "pic.h" diff --git a/src/platform/i386/miniacpi.h b/src/platform/i386/miniacpi.h deleted file mode 100644 index 2c42b3f1f4..0000000000 --- a/src/platform/i386/miniacpi.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef MINIACPI_H -#define MINIACPI_H - -void *miniacpi_find_apic(void); -void *miniacpi_find_rsdt(void); -void *miniacpi_find_hpet(void); -void miniacpi_set_rsdt_page(u32_t); - -#endif /* MINIACPI_H */ diff --git a/src/platform/i386/vm.c b/src/platform/i386/vm.c index 99630b024d..ea5811b79f 100644 --- a/src/platform/i386/vm.c +++ b/src/platform/i386/vm.c @@ -74,17 +74,17 @@ kern_setup_image(void) /* FIXME: Ugly hack to get the physical page with the ACPI RSDT mapped */ printk("ACPI initialization\n"); - void *rsdt = miniacpi_find_rsdt(); + void *rsdt = acpi_find_rsdt(); if (rsdt) { u32_t lapic, page; u64_t hpet; page = round_up_to_pgd_page(rsdt) - (1 << 22); boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; - miniacpi_set_rsdt_page(j); + acpi_set_rsdt_page(j); j++; - hpet = hpet_find(miniacpi_find_hpet()); + hpet = hpet_find(acpi_find_hpet()); if (hpet) { page = round_up_to_pgd_page(hpet & 0xffffffff) - (1 << 22); boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | 
PGTBL_GLOBAL; @@ -93,7 +93,7 @@ kern_setup_image(void) } /* lapic memory map */ - lapic = lapic_find_localaddr(miniacpi_find_apic()); + lapic = lapic_find_localaddr(acpi_find_apic()); if (lapic) { page = round_up_to_pgd_page(lapic & 0xffffffff) - (1 << 22); boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; From a3f514ea9ccf7eeff12dd53c4ef873b8eb6648c1 Mon Sep 17 00:00:00 2001 From: phani Date: Sun, 4 Feb 2018 14:13:05 -0500 Subject: [PATCH 004/127] Moved ACPI MADT specifics out of LAPIC.c --- src/platform/i386/Makefile | 1 + src/platform/i386/acpi.c | 48 +++++++++++++++++-- src/platform/i386/acpi.h | 1 + src/platform/i386/apic_cntl.h | 39 +++++++++++++++ src/platform/i386/ioapic.c | 10 ++++ src/platform/i386/ioapic.h | 8 ++++ src/platform/i386/lapic.c | 90 ++++++----------------------------- src/platform/i386/lapic.h | 3 ++ 8 files changed, 122 insertions(+), 78 deletions(-) create mode 100644 src/platform/i386/apic_cntl.h create mode 100644 src/platform/i386/ioapic.c create mode 100644 src/platform/i386/ioapic.h diff --git a/src/platform/i386/Makefile b/src/platform/i386/Makefile index 31874fb459..20b31e91c8 100644 --- a/src/platform/i386/Makefile +++ b/src/platform/i386/Makefile @@ -33,6 +33,7 @@ OBJS += kernel.o OBJS += gdt.o OBJS += idt.o OBJS += pic.o +OBJS += ioapic.o OBJS += vm.o OBJS += printk.o OBJS += string.o diff --git a/src/platform/i386/acpi.c b/src/platform/i386/acpi.c index 72273abc75..6ebf8ddcce 100644 --- a/src/platform/i386/acpi.c +++ b/src/platform/i386/acpi.c @@ -2,6 +2,8 @@ #include "string.h" #include "mem_layout.h" #include "pgtbl.h" +#include "apic_cntl.h" +#include "ioapic.h" #define RSDP_LO_ADDRESS ((unsigned char *)0xc00E0000) #define RSDP_HI_ADDRESS ((unsigned char *)0xc00FFFFF) @@ -32,9 +34,10 @@ struct rsdt { struct rsdt *entry[0]; } __attribute__((packed)); -extern u8_t * boot_comp_pgd; -static u32_t basepage; -static struct rsdt *rsdt; +extern u8_t * boot_comp_pgd; +static u32_t basepage; +static struct rsdt *rsdt; +static unsigned char *madt; static inline void * pa2va(void *pa) @@ -142,3 +145,42 @@ acpi_set_rsdt_page(u32_t page) basepage = page * (1 << 22); rsdt = (struct rsdt *)pa2va(rsdt); } + +void +acpi_madt_intsrc_iter(unsigned char *addr) +{ + struct int_cntl_head *h = NULL, *end = NULL; + u32_t len = 0; + int nl = 0, nio = 0; + + assert(addr); + madt = addr; + h = (struct int_cntl_head *)(madt + APIC_CNTR_ARR_OFF); + len = *(u32_t *)(madt + APIC_HDR_LEN_OFF); + end = (struct int_cntl_head *)(madt + len); + + printk("\tMADT length %d (base struct %d)\n", len, APIC_CNTR_ARR_OFF); + assert(h <= end); + for (; h < end; h = (struct int_cntl_head *)((char *)h + h->len)) { + /* termination condition */ + assert(h->len >= sizeof(struct int_cntl_head)); + switch (h->type) { + case APIC_CNTL_LAPIC: { + nl ++; + lapic_iter((struct lapic_cntl *)h); + break; + } + case APIC_CNTL_IOAPIC: { + nio ++; + ioapic_iter((struct ioapic_cntl *)h); + break; + } + default: + /* See 5.2.12 in the ACPI 5.0 Spec */ + printk("\tInterrupt controller type %d: ignoring\n", h->type); + break; + } + } + + printk("\tMADT => LAPICs=%d, IOAPICs=%d\n", nl, nio); +} diff --git a/src/platform/i386/acpi.h b/src/platform/i386/acpi.h index f7eb3cb9e6..a46ed82e7a 100644 --- a/src/platform/i386/acpi.h +++ b/src/platform/i386/acpi.h @@ -5,5 +5,6 @@ void *acpi_find_apic(void); void *acpi_find_rsdt(void); void *acpi_find_hpet(void); void acpi_set_rsdt_page(u32_t); +void acpi_madt_intsrc_iter(unsigned char *); #endif /* ACPI_H */ diff --git 
a/src/platform/i386/apic_cntl.h b/src/platform/i386/apic_cntl.h new file mode 100644 index 0000000000..3a342efd20 --- /dev/null +++ b/src/platform/i386/apic_cntl.h @@ -0,0 +1,39 @@ +#ifndef APIC_CNTL_H +#define APIC_CNTL_H + +#define APIC_DEFAULT_PHYS 0xfee00000 +#define APIC_HDR_LEN_OFF 0x04 +#define APIC_CNTRLR_ADDR_OFF 0x24 +#define APIC_CNTRLR_FLAGS_OFF 0x28 +#define APIC_CNTR_ARR_OFF 0x2C + +/* See 5.2.12 in the ACPI 5.0 Spec */ +enum +{ + APIC_CNTL_LAPIC = 0, + APIC_CNTL_IOAPIC = 1, +}; + +struct int_cntl_head { + u8_t type; + u8_t len; +} __attribute__((packed)); + +struct lapic_cntl { + /* type == APIC_CNTL_LAPIC */ + struct int_cntl_head header; + u8_t proc_id; + u8_t apic_id; + u32_t flags; /* 0 = dead processor */ +} __attribute__((packed)); + +struct ioapic_cntl { + /* type == APIC_CNTL_IOAPIC */ + struct int_cntl_head header; + u8_t ioapic_id; + u8_t reserved; + u32_t ioapic_phys_addr; + u32_t glb_int_num_off; /* I/O APIC's interrupt base number offset */ +} __attribute__((packed)); + +#endif /* APIC_CNTL_H */ diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c new file mode 100644 index 0000000000..8c80452093 --- /dev/null +++ b/src/platform/i386/ioapic.c @@ -0,0 +1,10 @@ +#include "kernel.h" +#include "ioapic.h" + +#define IOAPIC_MAX 4 + +void +ioapic_iter(struct ioapic_cntl *io) +{ +} + diff --git a/src/platform/i386/ioapic.h b/src/platform/i386/ioapic.h new file mode 100644 index 0000000000..0e807518ca --- /dev/null +++ b/src/platform/i386/ioapic.h @@ -0,0 +1,8 @@ +#ifndef IOAPIC_H +#define IOAPIC_H + +#include "apic_cntl.h" + +void ioapic_iter(struct ioapic_cntl *); + +#endif /* IOAPIC_H */ diff --git a/src/platform/i386/lapic.c b/src/platform/i386/lapic.c index 1448ba3985..f28b77d2e5 100644 --- a/src/platform/i386/lapic.c +++ b/src/platform/i386/lapic.c @@ -2,41 +2,9 @@ #include "kernel.h" #include "chal_cpu.h" #include "isr.h" +#include "apic_cntl.h" -#define APIC_DEFAULT_PHYS 0xfee00000 -#define APIC_HDR_LEN_OFF 0x04 -#define APIC_CNTRLR_ADDR_OFF 0x24 -#define APIC_CNTRLR_FLAGS_OFF 0x28 -#define APIC_CNTR_ARR_OFF 0x2C - -/* See 5.2.12 in the ACPI 5.0 Spec */ -enum -{ - APIC_CNTL_LAPIC = 0, - APIC_CNTL_IOAPIC = 1, -}; - -struct int_cntl_head { - u8_t type; - u8_t len; -} __attribute__((packed)); - -struct lapic_cntl { - /* type == APIC_CNTL_LAPIC */ - struct int_cntl_head header; - u8_t proc_id; - u8_t apic_id; - u32_t flags; /* 0 = dead processor */ -} __attribute__((packed)); - -struct ioapic_cntl { - /* type == APIC_CNTL_IOAPIC */ - struct int_cntl_head header; - u8_t ioapic_id; - u8_t reserved; - u32_t ioapic_phys_addr; - u32_t glb_int_num_off; /* I/O APIC's interrupt base number offset */ -} __attribute__((packed)); +#define LAPIC_MAX 8 int ncpus = 1; int cpus[NUM_CPU]; @@ -147,48 +115,20 @@ lapic_apicid(void) } void -lapic_intsrc_iter(unsigned char *madt) +lapic_iter(struct lapic_cntl *l) { - struct int_cntl_head *h = (struct int_cntl_head *)(madt + APIC_CNTR_ARR_OFF); - u32_t len = *(u32_t *)(madt + APIC_HDR_LEN_OFF); - struct int_cntl_head *end = (struct int_cntl_head *)(madt + len); - int us = lapic_apicid(), off = 1; - - cpus[0] = us; - printk("\tMADT length %d (base struct %d)\n", len, APIC_CNTR_ARR_OFF); - assert(h <= end); - for (; h < end; h = (struct int_cntl_head *)((char *)h + h->len)) { - /* termination condition */ - assert(h->len >= sizeof(struct int_cntl_head)); - switch (h->type) { - case APIC_CNTL_LAPIC: { - struct lapic_cntl *l = (struct lapic_cntl *)h; - - assert(l->header.len == sizeof(struct lapic_cntl)); - printk("\tLAPIC 
found: coreid %d, apicid %d\n", l->proc_id, l->apic_id); - - if (l->proc_id != us && NUM_CPU > 1) { - cpus[off++] = l->proc_id; - ncpus++; - } - - break; - } - case APIC_CNTL_IOAPIC: { - struct ioapic_cntl *io = (struct ioapic_cntl *)h; - - assert(io->header.len == sizeof(struct ioapic_cntl)); - printk("\tI/O APIC found: ioapicid %d, addr %x, int offset %d\n", io->ioapic_id, - io->ioapic_phys_addr, io->glb_int_num_off); - break; - } - default: - /* See 5.2.12 in the ACPI 5.0 Spec */ - printk("\tInterrupt controller type %d: ignoring\n", h->type); - break; - } + static int off = 0; + int us = lapic_apicid(); + + assert(off < LAPIC_MAX); + + assert(l->header.len == sizeof(struct lapic_cntl)); + printk("\tLAPIC found: coreid %d, apicid %d\n", l->proc_id, l->apic_id); + + if (l->proc_id != us && NUM_CPU > 1) { + cpus[off++] = l->proc_id; + ncpus++; } - printk("\tAPICs processed, %d cores\n", ncpus); } u32_t @@ -214,7 +154,7 @@ lapic_find_localaddr(void *l) addr = *(u32_t *)(lapicaddr + APIC_CNTRLR_ADDR_OFF); apic_flags = *(u32_t *)(lapicaddr + APIC_CNTRLR_FLAGS_OFF); assert(apic_flags == 1); /* we're assuming the PIC exists */ - lapic_intsrc_iter(lapicaddr); + acpi_madt_intsrc_iter(lapicaddr); printk("\tChecksum is OK\n"); lapic = (void *)(addr); diff --git a/src/platform/i386/lapic.h b/src/platform/i386/lapic.h index 5ea4841b66..bd2dfb08de 100644 --- a/src/platform/i386/lapic.h +++ b/src/platform/i386/lapic.h @@ -1,6 +1,9 @@ #ifndef LAPIC_H #define LAPIC_H +#include "apic_cntl.h" + +void lapic_iter(struct lapic_cntl *); u32_t lapic_find_localaddr(void *l); void lapic_set_page(u32_t page); void lapic_timer_init(void); From 72f7eb1e8ff286726b0c2e7bc6a4ccfaa9becb4e Mon Sep 17 00:00:00 2001 From: phani Date: Sun, 4 Feb 2018 20:01:03 -0500 Subject: [PATCH 005/127] Added support for IOAPIC (disabled PIC) --- src/platform/i386/Makefile | 1 + src/platform/i386/acpi.c | 4 + src/platform/i386/apic_cntl.h | 48 +++++++----- src/platform/i386/hpet.c | 4 +- src/platform/i386/idt.c | 2 +- src/platform/i386/ioapic.c | 140 +++++++++++++++++++++++++++++++++- src/platform/i386/ioapic.h | 7 ++ src/platform/i386/kernel.h | 2 + src/platform/i386/keyboard.c | 18 +++++ src/platform/i386/lapic.c | 2 +- src/platform/i386/lapic.h | 1 + src/platform/i386/pic.c | 2 +- src/platform/i386/serial.c | 2 +- src/platform/i386/vga.c | 17 ----- src/platform/i386/vm.c | 36 +++++---- 15 files changed, 228 insertions(+), 58 deletions(-) create mode 100644 src/platform/i386/keyboard.c diff --git a/src/platform/i386/Makefile b/src/platform/i386/Makefile index 20b31e91c8..0b222c5920 100644 --- a/src/platform/i386/Makefile +++ b/src/platform/i386/Makefile @@ -46,6 +46,7 @@ OBJS += chal.o OBJS += boot_comp.o OBJS += acpi.o OBJS += vga.o +OBJS += keyboard.o OBJS += exception.o OBJS += lapic.o diff --git a/src/platform/i386/acpi.c b/src/platform/i386/acpi.c index 6ebf8ddcce..4ba5791a2f 100644 --- a/src/platform/i386/acpi.c +++ b/src/platform/i386/acpi.c @@ -175,6 +175,10 @@ acpi_madt_intsrc_iter(unsigned char *addr) ioapic_iter((struct ioapic_cntl *)h); break; } + case APIC_CNTL_ISO: { + ioapic_int_override((struct intsrcovrride_cntl *)h); + break; + } default: /* See 5.2.12 in the ACPI 5.0 Spec */ printk("\tInterrupt controller type %d: ignoring\n", h->type); diff --git a/src/platform/i386/apic_cntl.h b/src/platform/i386/apic_cntl.h index 3a342efd20..e76c32b33f 100644 --- a/src/platform/i386/apic_cntl.h +++ b/src/platform/i386/apic_cntl.h @@ -1,39 +1,49 @@ #ifndef APIC_CNTL_H #define APIC_CNTL_H -#define APIC_DEFAULT_PHYS 0xfee00000 
-#define APIC_HDR_LEN_OFF 0x04 -#define APIC_CNTRLR_ADDR_OFF 0x24 +#define APIC_DEFAULT_PHYS 0xFEE00000 +#define APIC_HDR_LEN_OFF 0x04 +#define APIC_CNTRLR_ADDR_OFF 0x24 #define APIC_CNTRLR_FLAGS_OFF 0x28 -#define APIC_CNTR_ARR_OFF 0x2C +#define APIC_CNTR_ARR_OFF 0x2C /* See 5.2.12 in the ACPI 5.0 Spec */ enum { - APIC_CNTL_LAPIC = 0, - APIC_CNTL_IOAPIC = 1, + APIC_CNTL_LAPIC = 0, + APIC_CNTL_IOAPIC = 1, + APIC_CNTL_ISO = 2, }; struct int_cntl_head { - u8_t type; - u8_t len; + u8_t type; + u8_t len; } __attribute__((packed)); struct lapic_cntl { - /* type == APIC_CNTL_LAPIC */ - struct int_cntl_head header; - u8_t proc_id; - u8_t apic_id; - u32_t flags; /* 0 = dead processor */ + /* type == APIC_CNTL_LAPIC */ + struct int_cntl_head header; + u8_t proc_id; + u8_t apic_id; + u32_t flags; /* 0 = dead processor */ } __attribute__((packed)); struct ioapic_cntl { - /* type == APIC_CNTL_IOAPIC */ - struct int_cntl_head header; - u8_t ioapic_id; - u8_t reserved; - u32_t ioapic_phys_addr; - u32_t glb_int_num_off; /* I/O APIC's interrupt base number offset */ + /* type == APIC_CNTL_IOAPIC */ + struct int_cntl_head header; + u8_t ioapic_id; + u8_t reserved; + u32_t ioapic_phys_addr; + u32_t glb_int_num_off; /* I/O APIC's interrupt base number offset */ +} __attribute__((packed)); + +struct intsrcovrride_cntl { + /* type == APIC_CNTL_ISO */ + struct int_cntl_head header; + u8_t bus; + u8_t source; + u32_t glb_int_num_off; + u16_t flags; } __attribute__((packed)); #endif /* APIC_CNTL_H */ diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index 17494bcca1..d67a56e559 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -184,7 +184,7 @@ hpet_periodic_handler(struct pt_regs *regs) if (unlikely(hpet_calibration_init)) hpet_calibration(); - pic_ack_irq(HW_HPET_PERIODIC); + lapic_ack(); preempt = cap_hw_asnd(&hw_asnd_caps[HW_HPET_PERIODIC], regs); HPET_INT_ENABLE(HPET_PERIODIC); @@ -198,7 +198,7 @@ hpet_oneshot_handler(struct pt_regs *regs) assert(!hpet_calibration_init); - pic_ack_irq(HW_HPET_ONESHOT); + lapic_ack(); preempt = cap_hw_asnd(&hw_asnd_caps[HW_HPET_ONESHOT], regs); HPET_INT_ENABLE(HPET_ONESHOT); diff --git a/src/platform/i386/idt.c b/src/platform/i386/idt.c index e6f021aaa3..2ae18b5421 100644 --- a/src/platform/i386/idt.c +++ b/src/platform/i386/idt.c @@ -47,7 +47,7 @@ hw_handler(struct pt_regs *regs) * TODO: ack here? or * after user-level interrupt(rcv event) processing? 
*/ - pic_ack_irq(regs->orig_ax); + lapic_ack(); preempt = cap_hw_asnd(&hw_asnd_caps[regs->orig_ax], regs); return preempt; diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c index 8c80452093..b417852d1a 100644 --- a/src/platform/i386/ioapic.c +++ b/src/platform/i386/ioapic.c @@ -3,8 +3,146 @@ #define IOAPIC_MAX 4 +#define IOAPIC_IOAPICID 0x00 +#define IOAPIC_IOAPICVER 0x01 +#define IOAPIC_IOAPICARB 0x02 + +#define IOAPIC_IOREGSEL 0x00 +#define IOAPIC_IOWIN (IOAPIC_IOREGSEL + 0x10) +#define IOAPIC_IOREDTBL 0x10 +#define IOAPIC_IOREDTBL_OFFSET(n) (IOAPIC_IOREDTBL + 2*n) + +#define IOAPIC_INT_DISABLED (1<<16) + +enum ioapic_deliverymode +{ + IOAPIC_DELIV_FIXED = 0, + IOAPIC_DELIV_LOWEST = 1, + IOAPIC_DELIV_SMI = 2, + IOAPIC_DELIV_NMI = 4, + IOAPIC_DELIV_INIT = 5, + IOAPIC_DELIV_EXTINT = 7, +}; + +enum ioapic_dstmode +{ + IOAPIC_DST_PHYSICAL = 0, + IOAPIC_DST_LOGICAL = 1, +}; + +enum ioapic_pinpolarity +{ + IOAPIC_POL_ACTHIGH = 0, + IOAPIC_POL_ACTLOW = 1, +}; + +enum ioapic_triggermode +{ + IOAPIC_TRIGGER_EDGE = 0, + IOAPIC_TRIGGER_LEVEL = 1, +}; + + +static volatile void *ioapic_base = (volatile void *)0xfec00000; +static unsigned int ioapic_count; + void -ioapic_iter(struct ioapic_cntl *io) +ioapic_set_page(u32_t page) { + ioapic_base = (volatile u32_t *)(page * (1 << 22) | ((u32_t)ioapic_base & ((1 << 22) - 1))); + + printk("\tSet IOAPIC @ %p\n", ioapic_base); +} + +static void +ioapic_reg_write(u8_t offset, u32_t val) +{ + *(volatile u32_t *)(ioapic_base + IOAPIC_IOREGSEL) = offset; + *(volatile u32_t *)(ioapic_base + IOAPIC_IOWIN) = val; +} + +static u32_t +ioapic_reg_read(u8_t offset) +{ + *(volatile u32_t *)(ioapic_base + IOAPIC_IOREGSEL) = offset; + + return *(volatile u32_t *)(ioapic_base + IOAPIC_IOWIN); } +void +ioapic_int_mask(int intnum) +{ + /* + * TODO: + * 1. how to find which IOAPIC ? + * 2. read register (only first 32bits) for that redirection entry. + * 3. mask that bit and write that register back. + */ +} + +void +ioapic_int_unmask(int intnum) +{ + /* + * TODO: + * 1. how to find which IOAPIC ? + * 2. read register (only first 32bits) for that redirection entry. + * 3. unmask that bit and write that register back. + */ +} + +void +ioapic_int_override(struct intsrcovrride_cntl *iso) +{ + int src, gsi; + + assert(iso->header.len == sizeof(struct intsrcovrride_cntl)); + printk("\tInterrupt Source Override for [%u] => %u\n", iso->source, iso->glb_int_num_off); + + /* TODO: Find the right IOAPIC based on the GSI base in IOAPIC info */ + + if (iso->source != iso->glb_int_num_off) { + ioapic_reg_write(IOAPIC_IOREDTBL_OFFSET(iso->glb_int_num_off), iso->source + 32); + } +} + +void +ioapic_int_enable(int irqnum, int cpunum, int addflag) +{ + if (addflag) { + /* TODO: logical destination = 1 and add core no or lapic number? */ + } else { + ioapic_reg_write(IOAPIC_IOREDTBL_OFFSET(irqnum), irqnum + 32); + ioapic_reg_write(IOAPIC_IOREDTBL_OFFSET(irqnum)+1, cpunum<<24); + } +} + +void +ioapic_int_disable(int irqnum) +{ + ioapic_reg_write(IOAPIC_IOREDTBL_OFFSET(irqnum), IOAPIC_INT_DISABLED | irqnum); + ioapic_reg_write(IOAPIC_IOREDTBL_OFFSET(irqnum)+1, 0); +} + +void +ioapic_iter(struct ioapic_cntl *io) +{ + u32_t ver, ioent, i; + + assert(io); + + ioapic_count ++; + + /* FIXME: Just one for now! 
*/ + if (ioapic_count > 1) return; + + ioapic_base = (volatile u32_t *)(io->ioapic_phys_addr); + ioapic_set_page(vm_set_supage((u32_t)ioapic_base)); + + ver = ioapic_reg_read(IOAPIC_IOAPICVER); + ioent = ((ver >> 16) & 0xFF) + 1; + + printk("\tIOAPIC %d: Number of entries = %d\n", io->ioapic_id, ioent); + + for (i = 0; i < ioent; i++) ioapic_int_enable(i, 0, 0); +} diff --git a/src/platform/i386/ioapic.h b/src/platform/i386/ioapic.h index 0e807518ca..67653815a8 100644 --- a/src/platform/i386/ioapic.h +++ b/src/platform/i386/ioapic.h @@ -4,5 +4,12 @@ #include "apic_cntl.h" void ioapic_iter(struct ioapic_cntl *); +void ioapic_int_mask(int irq); +void ioapic_int_unmask(int irq); + +void ioapic_int_disable(int irq); +void ioapic_int_enable(int irq, int cpu, int add); + +void ioapic_int_override(struct intsrcovrride_cntl *); #endif /* IOAPIC_H */ diff --git a/src/platform/i386/kernel.h b/src/platform/i386/kernel.h index 0ba6c6ae0f..b42aeb2d3f 100644 --- a/src/platform/i386/kernel.h +++ b/src/platform/i386/kernel.h @@ -21,8 +21,10 @@ void tss_init(void); void idt_init(void); void gdt_init(void); void user_init(void); + void paging_init(void); void kern_paging_map_init(void *pa); +int vm_set_supage(u32_t addr); void tls_update(u32_t addr); diff --git a/src/platform/i386/keyboard.c b/src/platform/i386/keyboard.c new file mode 100644 index 0000000000..511ae9e9d3 --- /dev/null +++ b/src/platform/i386/keyboard.c @@ -0,0 +1,18 @@ +#include "kernel.h" + +#define KEY_DEVICE 0x60 +#define KEY_PENDING 0x64 + +void +keyboard_handler(struct pt_regs *regs) +{ + u16_t scancode = 0; + + lapic_ack(); + + while (inb(KEY_PENDING) & 2) { + /* wait for keypress to be ready */ + } + scancode = inb(KEY_DEVICE); + printk("Keyboard press: %d\n", scancode); +} diff --git a/src/platform/i386/lapic.c b/src/platform/i386/lapic.c index f28b77d2e5..db5b7848f4 100644 --- a/src/platform/i386/lapic.c +++ b/src/platform/i386/lapic.c @@ -66,7 +66,7 @@ lapic_write_reg(u32_t off, u32_t val) *(u32_t *)(lapic + off) = val; } -static void +void lapic_ack(void) { if (lapic) lapic_write_reg(LAPIC_EOI_REG, 0); diff --git a/src/platform/i386/lapic.h b/src/platform/i386/lapic.h index bd2dfb08de..57bcda614e 100644 --- a/src/platform/i386/lapic.h +++ b/src/platform/i386/lapic.h @@ -3,6 +3,7 @@ #include "apic_cntl.h" +void lapic_ack(void); void lapic_iter(struct lapic_cntl *); u32_t lapic_find_localaddr(void *l); void lapic_set_page(u32_t page); diff --git a/src/platform/i386/pic.c b/src/platform/i386/pic.c index a7cfa7eb5c..ed2047a2f8 100644 --- a/src/platform/i386/pic.c +++ b/src/platform/i386/pic.c @@ -53,5 +53,5 @@ pic_init(void) outb(PIC1_DATA, PIC_ICW4_8086); outb(PIC2_DATA, PIC_ICW4_8086); - pic_enable(); + pic_disable(); } diff --git a/src/platform/i386/serial.c b/src/platform/i386/serial.c index 6c867b8f92..fe1eb7a7cb 100644 --- a/src/platform/i386/serial.c +++ b/src/platform/i386/serial.c @@ -41,7 +41,7 @@ serial_handler(struct pt_regs *r) char serial; int preempt = 1; - pic_ack_irq(HW_SERIAL); + lapic_ack(); serial = serial_recv(); diff --git a/src/platform/i386/vga.c b/src/platform/i386/vga.c index 991b73f60a..c18b97900e 100644 --- a/src/platform/i386/vga.c +++ b/src/platform/i386/vga.c @@ -44,9 +44,6 @@ #define VGA_CTL_REG 0x3D4 #define VGA_DATA_REG 0x3D5 -#define KEY_DEVICE 0x60 -#define KEY_PENDING 0x64 - /* Variables. */ /* Save the X position. 
*/ static int csr_x; @@ -217,17 +214,3 @@ vga_init(void) cls(); printk_register_handler(vga_puts); } - -void -keyboard_handler(struct pt_regs *regs) -{ - u16_t scancode = 0; - - pic_ack_irq(HW_KEYBOARD); - - while (inb(KEY_PENDING) & 2) { - /* wait for keypress to be ready */ - } - scancode = inb(KEY_DEVICE); - printk("Keyboard press: %d\n", scancode); -} diff --git a/src/platform/i386/vm.c b/src/platform/i386/vm.c index ea5811b79f..92098817d7 100644 --- a/src/platform/i386/vm.c +++ b/src/platform/i386/vm.c @@ -52,6 +52,21 @@ u8_t *mem_boot_alloc(int npages) /* boot-time, bump-ptr heap */ return r; } +static unsigned long vm_pgd_idx = COS_MEM_KERN_START_VA / PGD_RANGE; + +int +vm_set_supage(u32_t addr) +{ + int idx = vm_pgd_idx; + u32_t page; + + page = round_to_pgd_page(addr); + boot_comp_pgd[idx] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; + vm_pgd_idx ++; + + return idx; +} + int kern_setup_image(void) { @@ -72,6 +87,8 @@ kern_setup_image(void) boot_comp_pgd[i / PGD_RANGE] = 0; /* unmap lower addresses */ } + vm_pgd_idx = j; + /* FIXME: Ugly hack to get the physical page with the ACPI RSDT mapped */ printk("ACPI initialization\n"); void *rsdt = acpi_find_rsdt(); @@ -80,28 +97,17 @@ kern_setup_image(void) u64_t hpet; page = round_up_to_pgd_page(rsdt) - (1 << 22); - boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; - acpi_set_rsdt_page(j); - j++; + acpi_set_rsdt_page(vm_set_supage(page)); hpet = hpet_find(acpi_find_hpet()); - if (hpet) { - page = round_up_to_pgd_page(hpet & 0xffffffff) - (1 << 22); - boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; - hpet_set_page(j); - j++; - } + if (hpet) hpet_set_page(vm_set_supage(hpet)); /* lapic memory map */ lapic = lapic_find_localaddr(acpi_find_apic()); - if (lapic) { - page = round_up_to_pgd_page(lapic & 0xffffffff) - (1 << 22); - boot_comp_pgd[j] = page | PGTBL_PRESENT | PGTBL_WRITABLE | PGTBL_SUPER | PGTBL_GLOBAL; - lapic_set_page(j); - j++; - } + if (lapic) lapic_set_page(vm_set_supage(lapic)); } + j = vm_pgd_idx; for (; j < PAGE_SIZE / sizeof(unsigned int); i += PGD_RANGE, j++) { boot_comp_pgd[j] = boot_comp_pgd[i / PGD_RANGE] = 0; } From 17a0a81aaf8d8a34fc2c006d214db1a7b45c7661 Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 5 Feb 2018 14:06:35 -0500 Subject: [PATCH 006/127] Basic support for multiple IOAPICs (not tested with multiple) --- src/platform/i386/ioapic.c | 131 +++++++++++++++++++++++++------------ src/platform/i386/ioapic.h | 2 + src/platform/i386/kernel.c | 1 + src/platform/i386/kernel.h | 1 + src/platform/i386/pic.c | 8 ++- src/platform/i386/pic.h | 2 + src/platform/i386/serial.c | 2 +- src/platform/i386/vga.c | 1 + 8 files changed, 102 insertions(+), 46 deletions(-) diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c index b417852d1a..82e2755cb0 100644 --- a/src/platform/i386/ioapic.c +++ b/src/platform/i386/ioapic.c @@ -1,5 +1,6 @@ #include "kernel.h" #include "ioapic.h" +#include "pic.h" #define IOAPIC_MAX 4 @@ -42,107 +43,153 @@ enum ioapic_triggermode IOAPIC_TRIGGER_LEVEL = 1, }; +struct ioapic_info { + unsigned int ioapicid; + volatile void *io_vaddr; + int nentries; + int glbint_base; +}; -static volatile void *ioapic_base = (volatile void *)0xfec00000; +static struct ioapic_info ioapicinfo[IOAPIC_MAX] = { { 0, NULL, 0, 0} }; static unsigned int ioapic_count; void -ioapic_set_page(u32_t page) +ioapic_set_page(struct ioapic_info *io, u32_t page) { - ioapic_base = (volatile u32_t *)(page * (1 << 22) | 
((u32_t)ioapic_base & ((1 << 22) - 1))); + io->io_vaddr = (volatile u32_t *)(page * (1 << 22) | ((u32_t)io->io_vaddr & ((1 << 22) - 1))); - printk("\tSet IOAPIC @ %p\n", ioapic_base); + printk("\tSet IOAPIC %d @ %p\n", io->ioapicid, io->io_vaddr); } static void -ioapic_reg_write(u8_t offset, u32_t val) +ioapic_reg_write(struct ioapic_info *io, u8_t offset, u32_t val) { - *(volatile u32_t *)(ioapic_base + IOAPIC_IOREGSEL) = offset; - *(volatile u32_t *)(ioapic_base + IOAPIC_IOWIN) = val; + *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOREGSEL) = offset; + *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOWIN) = val; } static u32_t -ioapic_reg_read(u8_t offset) +ioapic_reg_read(struct ioapic_info *io, u8_t offset) { - *(volatile u32_t *)(ioapic_base + IOAPIC_IOREGSEL) = offset; + *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOREGSEL) = offset; - return *(volatile u32_t *)(ioapic_base + IOAPIC_IOWIN); + return *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOWIN); } void ioapic_int_mask(int intnum) { - /* - * TODO: - * 1. how to find which IOAPIC ? - * 2. read register (only first 32bits) for that redirection entry. - * 3. mask that bit and write that register back. - */ + /* TODO */ } void ioapic_int_unmask(int intnum) { - /* - * TODO: - * 1. how to find which IOAPIC ? - * 2. read register (only first 32bits) for that redirection entry. - * 3. unmask that bit and write that register back. - */ + /* TODO */ +} + +static struct ioapic_info * +ioapic_findbygsi(int irq) +{ + unsigned int i = 0; + + for (; i < ioapic_count; i++) { + if (irq >= ioapicinfo[i].glbint_base && irq < ioapicinfo[i].nentries) return &ioapicinfo[i]; + } + + return NULL; +} + +static struct ioapic_info * +ioapic_findbyid(int id) +{ + unsigned int i = 0; + + for (; i < ioapic_count; i++) { + if (id == (int)(ioapicinfo[i].ioapicid)) return &ioapicinfo[i]; + } + + return NULL; } void ioapic_int_override(struct intsrcovrride_cntl *iso) { - int src, gsi; - assert(iso->header.len == sizeof(struct intsrcovrride_cntl)); - printk("\tInterrupt Source Override for [%u] => %u\n", iso->source, iso->glb_int_num_off); - - /* TODO: Find the right IOAPIC based on the GSI base in IOAPIC info */ if (iso->source != iso->glb_int_num_off) { - ioapic_reg_write(IOAPIC_IOREDTBL_OFFSET(iso->glb_int_num_off), iso->source + 32); + struct ioapic_info *ioap = ioapic_findbygsi(iso->glb_int_num_off); + + assert(ioap); + printk("\tInterrupt Source Override for [%u] => %u with IOAPIC %d\n", iso->source, iso->glb_int_num_off, ioap->ioapicid); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(iso->glb_int_num_off), iso->source + 32); } } void ioapic_int_enable(int irqnum, int cpunum, int addflag) { + struct ioapic_info *ioap = ioapic_findbygsi(irqnum); + + assert(ioap); if (addflag) { /* TODO: logical destination = 1 and add core no or lapic number? 
*/ } else { - ioapic_reg_write(IOAPIC_IOREDTBL_OFFSET(irqnum), irqnum + 32); - ioapic_reg_write(IOAPIC_IOREDTBL_OFFSET(irqnum)+1, cpunum<<24); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), irqnum + 32); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)+1, cpunum<<24); } } void ioapic_int_disable(int irqnum) { - ioapic_reg_write(IOAPIC_IOREDTBL_OFFSET(irqnum), IOAPIC_INT_DISABLED | irqnum); - ioapic_reg_write(IOAPIC_IOREDTBL_OFFSET(irqnum)+1, 0); + struct ioapic_info *ioap = ioapic_findbygsi(irqnum); + + assert(ioap); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), IOAPIC_INT_DISABLED | irqnum); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)+1, 0); } void ioapic_iter(struct ioapic_cntl *io) { - u32_t ver, ioent, i; + u32_t ver; + int ioent, j; + static int more = 0; assert(io); - ioapic_count ++; + if (ioapic_count == IOAPIC_MAX) { + more ++; + printk("\t%d more than %d IOAPICs present..\n", more, IOAPIC_MAX); + return; + } - /* FIXME: Just one for now! */ - if (ioapic_count > 1) return; - - ioapic_base = (volatile u32_t *)(io->ioapic_phys_addr); - ioapic_set_page(vm_set_supage((u32_t)ioapic_base)); + ioapicinfo[ioapic_count].io_vaddr = (volatile void *)(io->ioapic_phys_addr); + ioapicinfo[ioapic_count].ioapicid = io->ioapic_id; + ioapic_set_page(&(ioapicinfo[ioapic_count]), vm_set_supage((u32_t)(ioapicinfo[ioapic_count].io_vaddr))); - ver = ioapic_reg_read(IOAPIC_IOAPICVER); + ver = ioapic_reg_read(&ioapicinfo[ioapic_count], IOAPIC_IOAPICVER); ioent = ((ver >> 16) & 0xFF) + 1; + printk("\tIOAPIC %d (counter:%d): Number of entries = %d\n", io->ioapic_id, ioapic_count, ioent); + + ioapicinfo[ioapic_count].nentries = ioent; + ioapicinfo[ioapic_count].glbint_base = io->glb_int_num_off; + ioapic_count ++; + + for (j = 0; j < ioent; j++) ioapic_int_enable(io->glb_int_num_off + j, 0, 0); /* TODO: assign to different cores */ +} + +void +ioapic_init(void) +{ + assert(ioapic_count); + pic_disable(); - printk("\tIOAPIC %d: Number of entries = %d\n", io->ioapic_id, ioent); + printk("Setting up IOAPIC (disabling PIC)\n"); - for (i = 0; i < ioent; i++) ioapic_int_enable(i, 0, 0); + /* + * PCI Interrupts may need some attention here. + * TODO: Test it with NIC in RK env. + */ } diff --git a/src/platform/i386/ioapic.h b/src/platform/i386/ioapic.h index 67653815a8..2014a94d39 100644 --- a/src/platform/i386/ioapic.h +++ b/src/platform/i386/ioapic.h @@ -3,6 +3,8 @@ #include "apic_cntl.h" +void ioapic_init(void); + void ioapic_iter(struct ioapic_cntl *); void ioapic_int_mask(int irq); void ioapic_int_unmask(int irq); diff --git a/src/platform/i386/kernel.c b/src/platform/i386/kernel.c index c6b2d9492c..db272eb4ca 100644 --- a/src/platform/i386/kernel.c +++ b/src/platform/i386/kernel.c @@ -158,6 +158,7 @@ kmain(struct multiboot *mboot, u32_t mboot_magic, u32_t esp) hpet_init(); lapic_timer_init(); pic_init(); + ioapic_init(); kern_boot_upcall(); /* should not get here... 
*/ khalt(); diff --git a/src/platform/i386/kernel.h b/src/platform/i386/kernel.h index b42aeb2d3f..7cec95f600 100644 --- a/src/platform/i386/kernel.h +++ b/src/platform/i386/kernel.h @@ -16,6 +16,7 @@ #include "acpi.h" #include "lapic.h" #include "pic.h" +#include "ioapic.h" void tss_init(void); void idt_init(void); diff --git a/src/platform/i386/pic.c b/src/platform/i386/pic.c index ed2047a2f8..1de14dcabf 100644 --- a/src/platform/i386/pic.c +++ b/src/platform/i386/pic.c @@ -1,3 +1,4 @@ +#include "kernel.h" #include "pic.h" #define PIC_IRQ_BASE 0x20 @@ -27,14 +28,14 @@ #define PIC_ICW4_SFNM 0x10 /* Special fully nested (not) */ #define PIC_ICW1_ICW4 0x01 -static void +void pic_disable(void) { outb(PIC1_DATA, PIC_ALL_DISABLE); outb(PIC2_DATA, PIC_ALL_DISABLE); } -static void +void pic_enable(void) { outb(PIC1_DATA, PIC_ALL_ENABLE); @@ -44,6 +45,7 @@ pic_enable(void) void pic_init(void) { + printk("Setting up PIC\n"); outb(PIC1_CMD, PIC_ICW1_INIT | PIC_ICW1_ICW4); outb(PIC2_CMD, PIC_ICW1_INIT | PIC_ICW1_ICW4); outb(PIC1_DATA, PIC_IRQ_BASE); @@ -53,5 +55,5 @@ pic_init(void) outb(PIC1_DATA, PIC_ICW4_8086); outb(PIC2_DATA, PIC_ICW4_8086); - pic_disable(); + pic_enable(); } diff --git a/src/platform/i386/pic.h b/src/platform/i386/pic.h index 55f604fc38..29d0e9b4bf 100644 --- a/src/platform/i386/pic.h +++ b/src/platform/i386/pic.h @@ -4,6 +4,8 @@ #include "io.h" void pic_init(void); +void pic_enable(void); +void pic_disable(void); static void pic_ack_irq(int n) diff --git a/src/platform/i386/serial.c b/src/platform/i386/serial.c index fe1eb7a7cb..e3ea1cac50 100644 --- a/src/platform/i386/serial.c +++ b/src/platform/i386/serial.c @@ -79,7 +79,6 @@ serial_handler(struct pt_regs *r) void serial_init(void) { - printk("Enabling serial I/O\n"); printk_register_handler(serial_puts); /* We will initialize the first serial port */ @@ -92,4 +91,5 @@ serial_init(void) outb(SERIAL_PORT_A + 4, 0x0B); outb(SERIAL_PORT_A + 1, 0x01); /* Enable interrupts on receive */ + printk("Enabling serial I/O\n"); } diff --git a/src/platform/i386/vga.c b/src/platform/i386/vga.c index c18b97900e..0d05367a98 100644 --- a/src/platform/i386/vga.c +++ b/src/platform/i386/vga.c @@ -213,4 +213,5 @@ vga_init(void) csr_y = 0; cls(); printk_register_handler(vga_puts); + printk("Enabling VGA\n"); } From 37d3449fbe8421bd6eb01226cdbd2a0511699df2 Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 5 Feb 2018 15:45:45 -0500 Subject: [PATCH 007/127] Cleanup from self-review of the PR --- src/pic.h | 7 ------- src/platform/i386/chal/chal_config.h | 4 +++- src/platform/i386/ioapic.c | 9 +++++---- 3 files changed, 8 insertions(+), 12 deletions(-) delete mode 100644 src/pic.h diff --git a/src/pic.h b/src/pic.h deleted file mode 100644 index d092b841e8..0000000000 --- a/src/pic.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef PIC_H -#define PIC_H - -void pic_init(void); -void pic_ack_irq(int n); - -#endif /* PIC_H */ diff --git a/src/platform/i386/chal/chal_config.h b/src/platform/i386/chal/chal_config.h index 7302c37dd7..b16be21776 100644 --- a/src/platform/i386/chal/chal_config.h +++ b/src/platform/i386/chal/chal_config.h @@ -13,8 +13,10 @@ typedef signed int s32_t; typedef signed long long s64_t; #endif +#define HW_IRQ_START 32 + typedef enum { - HW_HPET_PERIODIC = 32, /* periodic timer interrupt */ + HW_HPET_PERIODIC = HW_IRQ_START, /* periodic timer interrupt */ HW_KEYBOARD, /* keyboard interrupt */ HW_ID3, HW_ID4, diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c index 82e2755cb0..750a0b6738 100644 --- a/src/platform/i386/ioapic.c 
+++ b/src/platform/i386/ioapic.c @@ -122,7 +122,7 @@ ioapic_int_override(struct intsrcovrride_cntl *iso) assert(ioap); printk("\tInterrupt Source Override for [%u] => %u with IOAPIC %d\n", iso->source, iso->glb_int_num_off, ioap->ioapicid); - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(iso->glb_int_num_off), iso->source + 32); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(iso->glb_int_num_off), iso->source + HW_IRQ_START); } } @@ -135,7 +135,7 @@ ioapic_int_enable(int irqnum, int cpunum, int addflag) if (addflag) { /* TODO: logical destination = 1 and add core no or lapic number? */ } else { - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), irqnum + 32); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), irqnum + HW_IRQ_START); ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)+1, cpunum<<24); } } @@ -146,8 +146,8 @@ ioapic_int_disable(int irqnum) struct ioapic_info *ioap = ioapic_findbygsi(irqnum); assert(ioap); - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), IOAPIC_INT_DISABLED | irqnum); - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)+1, 0); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), IOAPIC_INT_DISABLED | irqnum); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)+1, 0); } void @@ -162,6 +162,7 @@ ioapic_iter(struct ioapic_cntl *io) if (ioapic_count == IOAPIC_MAX) { more ++; printk("\t%d more than %d IOAPICs present..\n", more, IOAPIC_MAX); + return; } From b7342e3132fde3ecc00bdde344044f8f04a5f665 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 7 Feb 2018 10:47:01 -0500 Subject: [PATCH 008/127] ioapic mask/unmask (untested) --- src/platform/i386/ioapic.c | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c index 750a0b6738..398833212a 100644 --- a/src/platform/i386/ioapic.c +++ b/src/platform/i386/ioapic.c @@ -13,7 +13,7 @@ #define IOAPIC_IOREDTBL 0x10 #define IOAPIC_IOREDTBL_OFFSET(n) (IOAPIC_IOREDTBL + 2*n) -#define IOAPIC_INT_DISABLED (1<<16) +#define IOAPIC_INT_MASKED (1<<16) enum ioapic_deliverymode { @@ -76,18 +76,6 @@ ioapic_reg_read(struct ioapic_info *io, u8_t offset) return *(volatile u32_t *)(io->io_vaddr + IOAPIC_IOWIN); } -void -ioapic_int_mask(int intnum) -{ - /* TODO */ -} - -void -ioapic_int_unmask(int intnum) -{ - /* TODO */ -} - static struct ioapic_info * ioapic_findbygsi(int irq) { @@ -126,6 +114,26 @@ ioapic_int_override(struct intsrcovrride_cntl *iso) } } +void +ioapic_int_mask(int irqnum) +{ + struct ioapic_info *ioap = ioapic_findbygsi(irqnum); + + assert(ioap); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), IOAPIC_INT_MASKED | irqnum); +} + +void +ioapic_int_unmask(int irqnum) +{ + struct ioapic_info *ioap = ioapic_findbygsi(irqnum); + u32_t val = 0; + + assert(ioap); + val = ioapic_reg_read(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), val & ~IOAPIC_INT_MASKED); +} + void ioapic_int_enable(int irqnum, int cpunum, int addflag) { @@ -134,6 +142,7 @@ ioapic_int_enable(int irqnum, int cpunum, int addflag) assert(ioap); if (addflag) { /* TODO: logical destination = 1 and add core no or lapic number? 
*/ + assert(0); } else { ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), irqnum + HW_IRQ_START); ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)+1, cpunum<<24); @@ -146,7 +155,7 @@ ioapic_int_disable(int irqnum) struct ioapic_info *ioap = ioapic_findbygsi(irqnum); assert(ioap); - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), IOAPIC_INT_DISABLED | irqnum); + ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), IOAPIC_INT_MASKED | irqnum); ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)+1, 0); } From e15ba5b3ef523c9b45e59251183049dc3d1cf892 Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 9 Feb 2018 11:34:16 -0500 Subject: [PATCH 009/127] Review feedback fixes --- src/platform/i386/acpi.c | 4 ++-- src/platform/i386/ioapic.c | 9 +++++++-- src/platform/i386/kernel.h | 2 +- src/platform/i386/lapic.c | 2 +- src/platform/i386/vm.c | 8 ++++---- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/platform/i386/acpi.c b/src/platform/i386/acpi.c index 4ba5791a2f..bdc4d53aad 100644 --- a/src/platform/i386/acpi.c +++ b/src/platform/i386/acpi.c @@ -166,12 +166,12 @@ acpi_madt_intsrc_iter(unsigned char *addr) assert(h->len >= sizeof(struct int_cntl_head)); switch (h->type) { case APIC_CNTL_LAPIC: { - nl ++; + nl++; lapic_iter((struct lapic_cntl *)h); break; } case APIC_CNTL_IOAPIC: { - nio ++; + nio++; ioapic_iter((struct ioapic_cntl *)h); break; } diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c index 398833212a..6b96c51f1e 100644 --- a/src/platform/i386/ioapic.c +++ b/src/platform/i386/ioapic.c @@ -177,7 +177,7 @@ ioapic_iter(struct ioapic_cntl *io) ioapicinfo[ioapic_count].io_vaddr = (volatile void *)(io->ioapic_phys_addr); ioapicinfo[ioapic_count].ioapicid = io->ioapic_id; - ioapic_set_page(&(ioapicinfo[ioapic_count]), vm_set_supage((u32_t)(ioapicinfo[ioapic_count].io_vaddr))); + ioapic_set_page(&(ioapicinfo[ioapic_count]), vm_map_superpage((u32_t)(ioapicinfo[ioapic_count].io_vaddr))); ver = ioapic_reg_read(&ioapicinfo[ioapic_count], IOAPIC_IOAPICVER); ioent = ((ver >> 16) & 0xFF) + 1; @@ -200,6 +200,11 @@ ioapic_init(void) /* * PCI Interrupts may need some attention here. - * TODO: Test it with NIC in RK env. + * https://forum.osdev.org/viewtopic.php?f=1&t=21745 + * The discussion in the above forum suggest modern PCIe devices bypass IOAPIC and send + * interrupts directly to the core. For legacy PCI, we probably need to read some APIC tables. + * + * Update: I've tested by porting IOAPIC to RK on Qemu, udpserver test went OK. + * But on HW, I can't get even "ping" to work. TODO: Debugging! 
*/ } diff --git a/src/platform/i386/kernel.h b/src/platform/i386/kernel.h index 7cec95f600..331c6de6f0 100644 --- a/src/platform/i386/kernel.h +++ b/src/platform/i386/kernel.h @@ -25,7 +25,7 @@ void user_init(void); void paging_init(void); void kern_paging_map_init(void *pa); -int vm_set_supage(u32_t addr); +int vm_map_superpage(u32_t addr); void tls_update(u32_t addr); diff --git a/src/platform/i386/lapic.c b/src/platform/i386/lapic.c index db5b7848f4..146def5e3a 100644 --- a/src/platform/i386/lapic.c +++ b/src/platform/i386/lapic.c @@ -4,7 +4,7 @@ #include "isr.h" #include "apic_cntl.h" -#define LAPIC_MAX 8 +#define LAPIC_MAX NUM_CPU int ncpus = 1; int cpus[NUM_CPU]; diff --git a/src/platform/i386/vm.c b/src/platform/i386/vm.c index 92098817d7..8faec7ab47 100644 --- a/src/platform/i386/vm.c +++ b/src/platform/i386/vm.c @@ -55,7 +55,7 @@ u8_t *mem_boot_alloc(int npages) /* boot-time, bump-ptr heap */ static unsigned long vm_pgd_idx = COS_MEM_KERN_START_VA / PGD_RANGE; int -vm_set_supage(u32_t addr) +vm_map_superpage(u32_t addr) { int idx = vm_pgd_idx; u32_t page; @@ -97,14 +97,14 @@ kern_setup_image(void) u64_t hpet; page = round_up_to_pgd_page(rsdt) - (1 << 22); - acpi_set_rsdt_page(vm_set_supage(page)); + acpi_set_rsdt_page(vm_map_superpage(page)); hpet = hpet_find(acpi_find_hpet()); - if (hpet) hpet_set_page(vm_set_supage(hpet)); + if (hpet) hpet_set_page(vm_map_superpage(hpet)); /* lapic memory map */ lapic = lapic_find_localaddr(acpi_find_apic()); - if (lapic) lapic_set_page(vm_set_supage(lapic)); + if (lapic) lapic_set_page(vm_map_superpage(lapic)); } j = vm_pgd_idx; From 1ed8310f98195b6fdd04eabb21a2f055b281235e Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 9 Mar 2018 17:14:38 -0500 Subject: [PATCH 010/127] IOAPIC bugfixes --- src/kernel/include/chal.h | 3 + src/platform/i386/apic_cntl.h | 14 ++ src/platform/i386/hpet.c | 2 + src/platform/i386/ioapic.c | 263 ++++++++++++++++++++++++++-------- 4 files changed, 220 insertions(+), 62 deletions(-) diff --git a/src/kernel/include/chal.h b/src/kernel/include/chal.h index 7d74a3aed3..3d46d885bd 100644 --- a/src/kernel/include/chal.h +++ b/src/kernel/include/chal.h @@ -95,6 +95,9 @@ void chal_idle(void); void chal_timer_set(cycles_t cycles); void chal_timer_disable(void); +void chal_irq_disable(int irqline); +void chal_irq_enable(int irqline); + void chal_init(void); /* int cos_syscall_idle(void); */ diff --git a/src/platform/i386/apic_cntl.h b/src/platform/i386/apic_cntl.h index e76c32b33f..47f3073698 100644 --- a/src/platform/i386/apic_cntl.h +++ b/src/platform/i386/apic_cntl.h @@ -46,4 +46,18 @@ struct intsrcovrride_cntl { u16_t flags; } __attribute__((packed)); +enum acpi_madt_iso_polarity { + ACPI_MADT_ISO_POL_CONFORMS = 0, + ACPI_MADT_ISO_POL_ACTHIGH, + ACPI_MADT_ISO_POL_RESERVED, + ACPI_MADT_ISO_POL_ACTLOW, +}; + +enum acpi_madt_iso_trigger { + ACPI_MADT_ISO_TRIG_CONFORMS = 0, + ACPI_MADT_ISO_TRIG_EDGE, + ACPI_MADT_ISO_TRIG_RESERVED, + ACPI_MADT_ISO_TRIG_LEVEL, +}; + #endif /* APIC_CNTL_H */ diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index d67a56e559..57d67a7159 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -290,4 +290,6 @@ hpet_init(void) * specification is in hpet cycles (not cpu cycles). 
*/ hpet_set(HPET_PERIODIC, hpet_hpetcyc_per_tick); + + chal_irq_enable(HW_HPET_PERIODIC); } diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c index 6b96c51f1e..baa953503f 100644 --- a/src/platform/i386/ioapic.c +++ b/src/platform/i386/ioapic.c @@ -3,6 +3,7 @@ #include "pic.h" #define IOAPIC_MAX 4 +#define IOAPIC_INT_ISA_MAX 16 /* ACPI 5.0 spec: only ISA interrupts can have overrides */ #define IOAPIC_IOAPICID 0x00 #define IOAPIC_IOAPICVER 0x01 @@ -13,34 +14,34 @@ #define IOAPIC_IOREDTBL 0x10 #define IOAPIC_IOREDTBL_OFFSET(n) (IOAPIC_IOREDTBL + 2*n) -#define IOAPIC_INT_MASKED (1<<16) +#define IOAPIC_INT_DISABLED (1<<16) enum ioapic_deliverymode { - IOAPIC_DELIV_FIXED = 0, - IOAPIC_DELIV_LOWEST = 1, - IOAPIC_DELIV_SMI = 2, - IOAPIC_DELIV_NMI = 4, - IOAPIC_DELIV_INIT = 5, - IOAPIC_DELIV_EXTINT = 7, + IOAPIC_DELIV_FIXED = 0, + IOAPIC_DELIV_LOWEST = 1, + IOAPIC_DELIV_SMI = 2, + IOAPIC_DELIV_NMI = 4, + IOAPIC_DELIV_INIT = 5, + IOAPIC_DELIV_EXTINT = 7, }; enum ioapic_dstmode { - IOAPIC_DST_PHYSICAL = 0, - IOAPIC_DST_LOGICAL = 1, + IOAPIC_DST_PHYSICAL = 0, + IOAPIC_DST_LOGICAL = 1, }; enum ioapic_pinpolarity { - IOAPIC_POL_ACTHIGH = 0, - IOAPIC_POL_ACTLOW = 1, + IOAPIC_POL_ACTHIGH = 0, + IOAPIC_POL_ACTLOW = 1, }; enum ioapic_triggermode { - IOAPIC_TRIGGER_EDGE = 0, - IOAPIC_TRIGGER_LEVEL = 1, + IOAPIC_TRIGGER_EDGE = 0, + IOAPIC_TRIGGER_LEVEL = 1, }; struct ioapic_info { @@ -50,8 +51,59 @@ struct ioapic_info { int glbint_base; }; +union ioapic_int_redir_entry { + struct { + u64_t vector: 8; + u64_t delivmod: 3; + u64_t destmod: 1; + u64_t delivsts: 1; + u64_t polarity: 1; + u64_t remoteirr: 1; + u64_t trigger: 1; + u64_t mask: 1; + u64_t reserved: 39; + u64_t destination: 8; + }; + struct { + u32_t low_dword; + u32_t high_dword; + }; +}; + +struct ioapic_isa_override { + int source; + int gsi; + union { + struct { + u16_t polarity:2; + u16_t trigger:2; + u16_t reserved:12; + }; + u16_t flags; + }; +}; + static struct ioapic_info ioapicinfo[IOAPIC_MAX] = { { 0, NULL, 0, 0} }; static unsigned int ioapic_count; +static struct ioapic_isa_override ioapic_isainfo[IOAPIC_INT_ISA_MAX]; +static unsigned int ioapic_isaoverride_count; +static unsigned int ioapic_int_count; + +static union ioapic_int_redir_entry ioapic_int_isa_tmpl = { + .delivmod = IOAPIC_DELIV_FIXED, + .destmod = IOAPIC_DST_PHYSICAL, + .polarity = IOAPIC_POL_ACTHIGH, + .trigger = IOAPIC_TRIGGER_EDGE, + .mask = 1, +}; + +static union ioapic_int_redir_entry ioapic_int_pci_tmpl = { + .delivmod = IOAPIC_DELIV_FIXED, + .destmod = IOAPIC_DST_PHYSICAL, + .polarity = IOAPIC_POL_ACTLOW, + .trigger = IOAPIC_TRIGGER_EDGE, /* ref. 
barrelfish doesn't use level */ + .mask = 1, +}; void ioapic_set_page(struct ioapic_info *io, u32_t page) @@ -77,12 +129,12 @@ ioapic_reg_read(struct ioapic_info *io, u8_t offset) } static struct ioapic_info * -ioapic_findbygsi(int irq) +ioapic_findbygsi(int gsi) { unsigned int i = 0; for (; i < ioapic_count; i++) { - if (irq >= ioapicinfo[i].glbint_base && irq < ioapicinfo[i].nentries) return &ioapicinfo[i]; + if (gsi >= ioapicinfo[i].glbint_base && gsi < ioapicinfo[i].nentries) return &ioapicinfo[i]; } return NULL; @@ -100,63 +152,128 @@ ioapic_findbyid(int id) return NULL; } -void -ioapic_int_override(struct intsrcovrride_cntl *iso) +static inline void +ioapic_int_entry_write(struct ioapic_info *io, u8_t off, union ioapic_int_redir_entry entry) { - assert(iso->header.len == sizeof(struct intsrcovrride_cntl)); + int tmpoff = IOAPIC_IOREDTBL_OFFSET(off); - if (iso->source != iso->glb_int_num_off) { - struct ioapic_info *ioap = ioapic_findbygsi(iso->glb_int_num_off); + ioapic_reg_write(io, tmpoff, entry.low_dword); + ioapic_reg_write(io, tmpoff+1, entry.high_dword); +} - assert(ioap); - printk("\tInterrupt Source Override for [%u] => %u with IOAPIC %d\n", iso->source, iso->glb_int_num_off, ioap->ioapicid); - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(iso->glb_int_num_off), iso->source + HW_IRQ_START); - } +static inline union ioapic_int_redir_entry +ioapic_int_entry_read(struct ioapic_info *io, u8_t off) +{ + union ioapic_int_redir_entry entry; + int tmpoff = IOAPIC_IOREDTBL_OFFSET(off); + + entry.low_dword = ioapic_reg_read(io, tmpoff); + entry.high_dword = ioapic_reg_read(io, tmpoff+1); + + return entry; } -void -ioapic_int_mask(int irqnum) +static inline void +ioapic_int_mask_set(int gsi, int mask) { - struct ioapic_info *ioap = ioapic_findbygsi(irqnum); + struct ioapic_info *io = ioapic_findbygsi(gsi); + union ioapic_int_redir_entry entry; + u8_t off; + + if (!io) return; - assert(ioap); - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), IOAPIC_INT_MASKED | irqnum); + off = gsi - io->glbint_base; + entry = ioapic_int_entry_read(io, off); + entry.mask = mask ? 1 : 0; + ioapic_int_entry_write(io, off, entry); + entry = ioapic_int_entry_read(io, off); } -void -ioapic_int_unmask(int irqnum) +static inline int +ioapic_int_gsi(int gsi) { - struct ioapic_info *ioap = ioapic_findbygsi(irqnum); - u32_t val = 0; + int override_gsi = gsi; + int i; + + if (gsi < IOAPIC_INT_ISA_MAX) { + for (i = 0; i < (int)ioapic_isaoverride_count; i++) { + if (ioapic_isainfo[i].source == gsi && ioapic_isainfo[i].gsi != gsi) { + override_gsi = ioapic_isainfo[i].gsi; + break; + } + } + } - assert(ioap); - val = ioapic_reg_read(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)); - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), val & ~IOAPIC_INT_MASKED); + return override_gsi; } void -ioapic_int_enable(int irqnum, int cpunum, int addflag) +ioapic_int_mask(int gsi) { - struct ioapic_info *ioap = ioapic_findbygsi(irqnum); + ioapic_int_mask_set(ioapic_int_gsi(gsi), 1); +} - assert(ioap); - if (addflag) { - /* TODO: logical destination = 1 and add core no or lapic number? 
*/ - assert(0); - } else { - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), irqnum + HW_IRQ_START); - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)+1, cpunum<<24); - } +void +ioapic_int_unmask(int gsi) +{ + ioapic_int_mask_set(ioapic_int_gsi(gsi), 0); } void -ioapic_int_disable(int irqnum) +ioapic_int_override(struct intsrcovrride_cntl *iso) { - struct ioapic_info *ioap = ioapic_findbygsi(irqnum); + union ioapic_int_redir_entry entry = ioapic_int_isa_tmpl; + struct ioapic_info *iogsi = NULL, *iosrc = NULL; + + assert(iso->header.len == sizeof(struct intsrcovrride_cntl)); + + assert(iso->source < IOAPIC_INT_ISA_MAX); + assert(ioapic_isaoverride_count < IOAPIC_INT_ISA_MAX); - assert(ioap); - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum), IOAPIC_INT_MASKED | irqnum); - ioapic_reg_write(ioap, IOAPIC_IOREDTBL_OFFSET(irqnum)+1, 0); + if (iso->source != iso->glb_int_num_off) { + union ioapic_int_redir_entry srcentry = ioapic_int_isa_tmpl; + + iosrc = ioapic_findbygsi(iso->source); + assert(iosrc); + srcentry.vector = iso->glb_int_num_off + HW_IRQ_START; + ioapic_int_entry_write(iosrc, iso->source - iosrc->glbint_base, srcentry); + + ioapic_isainfo[ioapic_isaoverride_count].source = iso->glb_int_num_off; + ioapic_isainfo[ioapic_isaoverride_count].gsi = iso->source; + ioapic_isainfo[ioapic_isaoverride_count].flags = 0; + ioapic_isaoverride_count++; + } + + ioapic_isainfo[ioapic_isaoverride_count].source = iso->source; + ioapic_isainfo[ioapic_isaoverride_count].gsi = iso->glb_int_num_off; + ioapic_isainfo[ioapic_isaoverride_count].flags = iso->flags; + + printk("\tINT Override %u to %u, polarity: %u trigger: %u\n", iso->source, iso->glb_int_num_off, + ioapic_isainfo[ioapic_isaoverride_count].polarity, ioapic_isainfo[ioapic_isaoverride_count].trigger); + + switch(ioapic_isainfo[ioapic_isaoverride_count].trigger) { + case ACPI_MADT_ISO_TRIG_CONFORMS: break; + case ACPI_MADT_ISO_TRIG_EDGE: entry.trigger = IOAPIC_TRIGGER_EDGE; break; + case ACPI_MADT_ISO_TRIG_RESERVED: assert(0); break; + case ACPI_MADT_ISO_TRIG_LEVEL: entry.trigger = IOAPIC_TRIGGER_EDGE; break; /* XXX: should be level */ + default: break; + } + + switch(ioapic_isainfo[ioapic_isaoverride_count].polarity) { + case ACPI_MADT_ISO_POL_CONFORMS: break; + case ACPI_MADT_ISO_POL_ACTHIGH: entry.polarity = IOAPIC_POL_ACTHIGH; break; + case ACPI_MADT_ISO_POL_RESERVED: assert(0); break; + case ACPI_MADT_ISO_POL_ACTLOW: entry.polarity = IOAPIC_POL_ACTLOW; break; + default: break; + } + + entry.vector = iso->source + HW_IRQ_START; + iogsi = ioapic_findbygsi(iso->glb_int_num_off); + assert(iogsi); + + ioapic_int_entry_write(iogsi, iso->glb_int_num_off - iogsi->glbint_base, entry); + + ioapic_isaoverride_count++; } void @@ -165,6 +282,7 @@ ioapic_iter(struct ioapic_cntl *io) u32_t ver; int ioent, j; static int more = 0; + unsigned int tmp_count = ioapic_count; assert(io); @@ -174,20 +292,41 @@ ioapic_iter(struct ioapic_cntl *io) return; } - - ioapicinfo[ioapic_count].io_vaddr = (volatile void *)(io->ioapic_phys_addr); - ioapicinfo[ioapic_count].ioapicid = io->ioapic_id; - ioapic_set_page(&(ioapicinfo[ioapic_count]), vm_map_superpage((u32_t)(ioapicinfo[ioapic_count].io_vaddr))); - ver = ioapic_reg_read(&ioapicinfo[ioapic_count], IOAPIC_IOAPICVER); + ioapic_count ++; + ioapicinfo[tmp_count].io_vaddr = (volatile void *)(io->ioapic_phys_addr); + ioapicinfo[tmp_count].ioapicid = io->ioapic_id; + ioapic_set_page(&(ioapicinfo[tmp_count]), vm_map_superpage((u32_t)(ioapicinfo[tmp_count].io_vaddr))); + + ver = 
ioapic_reg_read(&ioapicinfo[tmp_count], IOAPIC_IOAPICVER); ioent = ((ver >> 16) & 0xFF) + 1; - printk("\tIOAPIC %d (counter:%d): Number of entries = %d\n", io->ioapic_id, ioapic_count, ioent); + printk("\tIOAPIC %d (counter:%d): Number of entries = %d\n", io->ioapic_id, tmp_count, ioent); - ioapicinfo[ioapic_count].nentries = ioent; - ioapicinfo[ioapic_count].glbint_base = io->glb_int_num_off; - ioapic_count ++; + ioapicinfo[tmp_count].nentries = ioent; + ioapicinfo[tmp_count].glbint_base = io->glb_int_num_off; + ioapic_int_count += ioent; + + for (j = 0; j < ioent; j++) { + union ioapic_int_redir_entry entry = (io->glb_int_num_off + j) < IOAPIC_INT_ISA_MAX ? ioapic_int_isa_tmpl : ioapic_int_pci_tmpl; + + entry.vector = io->glb_int_num_off + j + HW_IRQ_START; + + ioapic_int_entry_write(&ioapicinfo[tmp_count], j, entry); + } +} - for (j = 0; j < ioent; j++) ioapic_int_enable(io->glb_int_num_off + j, 0, 0); /* TODO: assign to different cores */ +void +chal_irq_enable(int irq) +{ + if (irq - HW_IRQ_START >= (int)ioapic_int_count) return; + ioapic_int_unmask(irq - HW_IRQ_START); +} + +void +chal_irq_disable(int irq) +{ + if (irq - HW_IRQ_START >= (int)ioapic_int_count) return; + ioapic_int_mask(irq - HW_IRQ_START); } void @@ -201,7 +340,7 @@ ioapic_init(void) /* * PCI Interrupts may need some attention here. * https://forum.osdev.org/viewtopic.php?f=1&t=21745 - * The discussion in the above forum suggest modern PCIe devices bypass IOAPIC and send + * The discussion in the above forum suggest modern PCIe devices bypass IOAPIC and send * interrupts directly to the core. For legacy PCI, we probably need to read some APIC tables. * * Update: I've tested by porting IOAPIC to RK on Qemu, udpserver test went OK. From 9a48aa255313027c183699d1d3a9e817c2727d48 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 14 Apr 2018 11:47:08 -0400 Subject: [PATCH 011/127] Bugfix, enable/disable on HW_ATTACH/DETACH --- src/kernel/capinv.c | 4 ++++ src/platform/i386/ioapic.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index a240c1aaa4..233e8c8333 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -1622,12 +1622,16 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * if (!CAP_TYPECHK(rcvc, CAP_ARCV)) cos_throw(err, -EINVAL); ret = hw_attach_rcvcap((struct cap_hw *)ch, hwid, rcvc, rcvcap); + if (!ret) chal_irq_enable(hwid); + break; } case CAPTBL_OP_HW_DETACH: { hwid_t hwid = __userregs_get1(regs); ret = hw_detach_rcvcap((struct cap_hw *)ch, hwid); + if (!ret) chal_irq_disable(hwid); + break; } case CAPTBL_OP_HW_MAP: { diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c index ef011cd83e..a390be2e2a 100644 --- a/src/platform/i386/ioapic.c +++ b/src/platform/i386/ioapic.c @@ -343,7 +343,8 @@ ioapic_init(void) * The discussion in the above forum suggest modern PCIe devices bypass IOAPIC and send * interrupts directly to the core. For legacy PCI, we probably need to read some APIC tables. * - * Update: I've tested by porting IOAPIC to RK on Qemu, udpserver test went OK. - * But on HW, I can't get even "ping" to work. TODO: Debugging! + * Update: with BMK_SCREW_INTERRUPT_ROUTING, got Rumpkernel to boot fine on HW as well. + * The effect of that BMK_SCREW_INTERRUPT_ROUTING is mostly in the BMK intr.c to use an array of lists vs + * single list. It doesn't change how NetBSD does interrupt processing. 
*/ } From 38687de55ae40b5b9d81201d36ddb029b82c49e5 Mon Sep 17 00:00:00 2001 From: Zheng Yang Date: Tue, 17 Apr 2018 19:05:19 +0000 Subject: [PATCH 012/127] test --- .../tests/micro_booter/mb_tests.c | 30 +++++++++---------- src/components/include/llprint.h | 4 +-- src/kernel/include/chal.h | 4 ++- src/kernel/include/shared/cos_config.h | 2 +- src/platform/i386/exception.c | 1 - src/platform/i386/hpet.c | 2 +- src/platform/i386/ioapic.c | 13 ++++---- src/platform/i386/ioapic.h | 2 +- src/platform/i386/kernel.c | 2 ++ src/platform/i386/keyboard.c | 11 +++++-- src/platform/i386/lapic.h | 1 + src/platform/i386/serial.c | 5 +++- 12 files changed, 46 insertions(+), 31 deletions(-) diff --git a/src/components/implementation/tests/micro_booter/mb_tests.c b/src/components/implementation/tests/micro_booter/mb_tests.c index 7662f09963..fd2949b2f3 100644 --- a/src/components/implementation/tests/micro_booter/mb_tests.c +++ b/src/components/implementation/tests/micro_booter/mb_tests.c @@ -857,21 +857,21 @@ test_run_mb(void) { cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - test_timer(); - test_budgets(); - - test_thds(); - test_thds_perf(); - - test_mem(); - - test_async_endpoints(); - test_async_endpoints_perf(); - - test_inv(); - test_inv_perf(); - - test_captbl_expand(); + // test_timer(); + // test_budgets(); + // + // test_thds(); + // test_thds_perf(); + // + // test_mem(); + // + // test_async_endpoints(); + // test_async_endpoints_perf(); + // + // test_inv(); + // test_inv_perf(); + // + // test_captbl_expand(); /* * FIXME: Preemption stack mechanism in the kernel is disabled. diff --git a/src/components/include/llprint.h b/src/components/include/llprint.h index 94a4dda267..9a822916ad 100644 --- a/src/components/include/llprint.h +++ b/src/components/include/llprint.h @@ -34,8 +34,8 @@ printc(char *fmt, ...) va_start(arg_ptr, fmt); ret = vsnprintf(s, len, fmt, arg_ptr); va_end(arg_ptr); - cos_llprint(s, ret); - + // cos_llprint(s, ret); + cos_print(s, ret); return ret; } diff --git a/src/kernel/include/chal.h b/src/kernel/include/chal.h index 3d46d885bd..56096d59aa 100644 --- a/src/kernel/include/chal.h +++ b/src/kernel/include/chal.h @@ -96,7 +96,7 @@ void chal_timer_set(cycles_t cycles); void chal_timer_disable(void); void chal_irq_disable(int irqline); -void chal_irq_enable(int irqline); +void chal_irq_enable(int irqline, int dest); void chal_init(void); @@ -107,6 +107,8 @@ void chal_init(void); #include "../../platform/include/chal_plat.h" +#define PRINTK(format, ...) printk("(CPU%ld:) " format, get_cpuid(), ## __VA_ARGS__) + extern void printk(const char *fmt, ...); void chal_khalt(void); diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index f8179556e2..132ed10586 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -37,7 +37,7 @@ #define BOOT_COMP_MAX_SZ (1 << 24) /* 16 MB for the booter component */ -#define NUM_CPU 1 +#define NUM_CPU 2 #define CPU_TIMER_FREQ 100 // set in your linux .config diff --git a/src/platform/i386/exception.c b/src/platform/i386/exception.c index 5b6694c01a..b4d2e4c538 100644 --- a/src/platform/i386/exception.c +++ b/src/platform/i386/exception.c @@ -6,7 +6,6 @@ #include "isr.h" #include "chal_cpu.h" -#define PRINTK(format, ...) 
printk("(CPU%ld:) " format, get_cpuid(), ## __VA_ARGS__) void print_regs_state(struct pt_regs *regs) diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index 6c172ac43b..031c318ee8 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -299,5 +299,5 @@ hpet_init(void) hpet_calibration_init = 1; hpet_set(HPET_PERIODIC, hpet_hpetcyc_per_tick); - chal_irq_enable(HW_HPET_PERIODIC); + chal_irq_enable(HW_HPET_PERIODIC, 0); } diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c index a390be2e2a..0662dd7cde 100644 --- a/src/platform/i386/ioapic.c +++ b/src/platform/i386/ioapic.c @@ -174,7 +174,7 @@ ioapic_int_entry_read(struct ioapic_info *io, u8_t off) } static inline void -ioapic_int_mask_set(int gsi, int mask) +ioapic_int_mask_set(int gsi, int mask, int dest) { struct ioapic_info *io = ioapic_findbygsi(gsi); union ioapic_int_redir_entry entry; @@ -185,6 +185,7 @@ ioapic_int_mask_set(int gsi, int mask) off = gsi - io->glbint_base; entry = ioapic_int_entry_read(io, off); entry.mask = mask ? 1 : 0; + entry.destination = apicids[dest]; ioapic_int_entry_write(io, off, entry); entry = ioapic_int_entry_read(io, off); } @@ -210,13 +211,13 @@ ioapic_int_gsi(int gsi) void ioapic_int_mask(int gsi) { - ioapic_int_mask_set(ioapic_int_gsi(gsi), 1); + ioapic_int_mask_set(ioapic_int_gsi(gsi), 1, 0); } void -ioapic_int_unmask(int gsi) +ioapic_int_unmask(int gsi, int dest) { - ioapic_int_mask_set(ioapic_int_gsi(gsi), 0); + ioapic_int_mask_set(ioapic_int_gsi(gsi), 0, dest); } void @@ -316,10 +317,10 @@ ioapic_iter(struct ioapic_cntl *io) } void -chal_irq_enable(int irq) +chal_irq_enable(int irq, int dest) { if (irq - HW_IRQ_START >= (int)ioapic_int_count) return; - ioapic_int_unmask(irq - HW_IRQ_START); + ioapic_int_unmask(irq - HW_IRQ_START, dest); } void diff --git a/src/platform/i386/ioapic.h b/src/platform/i386/ioapic.h index 2014a94d39..c8a1211c84 100644 --- a/src/platform/i386/ioapic.h +++ b/src/platform/i386/ioapic.h @@ -7,7 +7,7 @@ void ioapic_init(void); void ioapic_iter(struct ioapic_cntl *); void ioapic_int_mask(int irq); -void ioapic_int_unmask(int irq); +void ioapic_int_unmask(int irq, int dest); void ioapic_int_disable(int irq); void ioapic_int_enable(int irq, int cpu, int add); diff --git a/src/platform/i386/kernel.c b/src/platform/i386/kernel.c index d71f46edd7..a767f72eff 100644 --- a/src/platform/i386/kernel.c +++ b/src/platform/i386/kernel.c @@ -163,6 +163,8 @@ kmain(struct multiboot *mboot, u32_t mboot_magic, u32_t esp) kern_boot_comp(INIT_CORE); lapic_init(); hpet_init(); + chal_irq_enable(HW_SERIAL, 1); + chal_irq_enable(HW_KEYBOARD, 1); pic_init(); ioapic_init(); smp_init(cores_ready); diff --git a/src/platform/i386/keyboard.c b/src/platform/i386/keyboard.c index 511ae9e9d3..4e9d476995 100644 --- a/src/platform/i386/keyboard.c +++ b/src/platform/i386/keyboard.c @@ -3,10 +3,12 @@ #define KEY_DEVICE 0x60 #define KEY_PENDING 0x64 -void +int keyboard_handler(struct pt_regs *regs) { u16_t scancode = 0; + int preempt = 1; + static int chg = 0; lapic_ack(); @@ -14,5 +16,10 @@ keyboard_handler(struct pt_regs *regs) /* wait for keypress to be ready */ } scancode = inb(KEY_DEVICE); - printk("Keyboard press: %d\n", scancode); + PRINTK("Keyboard press: %d\n", scancode); + + chal_irq_enable(HW_KEYBOARD, chg); + chg = !chg; + + return preempt; } diff --git a/src/platform/i386/lapic.h b/src/platform/i386/lapic.h index f9a5507bf8..97cf13b5c3 100644 --- a/src/platform/i386/lapic.h +++ b/src/platform/i386/lapic.h @@ -12,6 +12,7 @@ void lapic_set_timer(int 
timer_type, cycles_t deadline); u32_t lapic_get_ccr(void); void lapic_timer_calibration(u32_t ratio); extern u32_t lapic_timer_calib_init; +extern int apicids[NUM_CPU]; void lapic_init(void); void smp_init(volatile int *cores_ready); diff --git a/src/platform/i386/serial.c b/src/platform/i386/serial.c index e487c36118..7ef1ce787f 100644 --- a/src/platform/i386/serial.c +++ b/src/platform/i386/serial.c @@ -40,6 +40,7 @@ serial_handler(struct pt_regs *r) { char serial; int preempt = 1; + static int chg = 0; lapic_ack(); @@ -71,7 +72,9 @@ serial_handler(struct pt_regs *r) break; } - printk("Serial: %d\n", serial); + PRINTK("Serial: %d\n", serial); + chal_irq_enable(HW_SERIAL, chg); + chg = !chg; // printk("%c", serial); return preempt; } From a77b6c1e432f4bf84c94f226c9fc7017a5dbc3c2 Mon Sep 17 00:00:00 2001 From: Zheng Yang Date: Thu, 19 Apr 2018 23:42:51 +0000 Subject: [PATCH 013/127] set ldr for cores --- src/kernel/capinv.c | 2 +- src/kernel/include/shared/cos_config.h | 2 +- src/platform/i386/lapic.c | 38 ++++++++++++++++++++++++++ src/platform/i386/lapic.h | 2 ++ 4 files changed, 42 insertions(+), 2 deletions(-) diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 233e8c8333..f1e0b698cb 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -1622,7 +1622,7 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * if (!CAP_TYPECHK(rcvc, CAP_ARCV)) cos_throw(err, -EINVAL); ret = hw_attach_rcvcap((struct cap_hw *)ch, hwid, rcvc, rcvcap); - if (!ret) chal_irq_enable(hwid); + if (!ret) chal_irq_enable(hwid, get_cpuid()); break; } diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index 132ed10586..34ffce3752 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -37,7 +37,7 @@ #define BOOT_COMP_MAX_SZ (1 << 24) /* 16 MB for the booter component */ -#define NUM_CPU 2 +#define NUM_CPU 4 #define CPU_TIMER_FREQ 100 // set in your linux .config diff --git a/src/platform/i386/lapic.c b/src/platform/i386/lapic.c index c49cc4e7c8..6c6e582b5c 100644 --- a/src/platform/i386/lapic.c +++ b/src/platform/i386/lapic.c @@ -7,6 +7,7 @@ int ncpus = 1; int apicids[NUM_CPU]; +u32_t logical_apicids[NUM_CPU]; #define CMOS_PORT 0x70 @@ -14,6 +15,7 @@ int apicids[NUM_CPU]; #define LAPIC_VERSION_REG 0x030 /* version */ #define LAPIC_TP_REG 0x080 /* Task Priority Register */ +#define LAPIC_LDR_REG 0x0D0 /* Logical destination register */ #define LAPIC_SIV_REG 0x0F0 /* spurious interrupt vector */ #define LAPIC_SIV_ENABLE (1 << 8) /* enable bit in the SIV */ #define LAPIC_EOI_REG 0x0B0 /* ack, or end-of-interrupt */ @@ -53,6 +55,10 @@ int apicids[NUM_CPU]; #define LAPIC_ONESHOT_THRESH (1 << 12) #define LAPIC_TSCDEADLINE_THRESH 0 +#define LAPIC_LDR_OFFSET 24 +#define LAPIC_LDR_MAST (0xfful << LAPIC_LDR_OFFSET) + + extern int timer_process(struct pt_regs *regs); enum lapic_timer_type @@ -191,12 +197,44 @@ lapic_find_localaddr(void *l) return addr; } +static u32_t +cons_logical_id(const u32_t id) +{ + int lid; + /* + * FIXME: xAPIC only support 8 bits bitmap for logical destination, + * So we will configure the logical id of cores with id larger than 7 + * to 7 which means we should find out a way(x2APIC) to fix this when we + * have more than 8 cores in ioapic. 
+ */ + + if (id >= 7) { + lid = 7; + } else { + lid = id; + } + return (1ul << lid) << LAPIC_LDR_OFFSET; +} + +static u32_t +lapic_set_ldr(const u32_t id) +{ + u32_t lid = cons_logical_id(id); + + lapic_write_reg(LAPIC_LDR_REG, lid | ~LAPIC_LDR_MAST); + return lid >> LAPIC_LDR_OFFSET; +} + void lapic_init(void) { u32_t version; assert(lapic); + + /* setup LDR for logic destination before init lapic */ + logical_apicids[get_cpuid()] = lapic_set_ldr(get_cpuid()); + lapic_write_reg(LAPIC_SIV_REG, LAPIC_SIV_ENABLE | HW_LAPIC_SPURIOUS); version = lapic_read_reg(LAPIC_VERSION_REG); diff --git a/src/platform/i386/lapic.h b/src/platform/i386/lapic.h index 97cf13b5c3..bfc80b8163 100644 --- a/src/platform/i386/lapic.h +++ b/src/platform/i386/lapic.h @@ -11,8 +11,10 @@ void lapic_timer_init(void); void lapic_set_timer(int timer_type, cycles_t deadline); u32_t lapic_get_ccr(void); void lapic_timer_calibration(u32_t ratio); + extern u32_t lapic_timer_calib_init; extern int apicids[NUM_CPU]; +extern u32_t logical_apicids[NUM_CPU]; void lapic_init(void); void smp_init(volatile int *cores_ready); From 5e892333a11b39778e161f0420f31b96d0592d90 Mon Sep 17 00:00:00 2001 From: Zheng Yang Date: Sat, 21 Apr 2018 07:10:13 +0000 Subject: [PATCH 014/127] finish logical mode --- src/kernel/capinv.c | 2 +- src/kernel/include/chal.h | 4 +-- src/platform/i386/ioapic.c | 57 ++++++++++++++++++++++++++++++------ src/platform/i386/ioapic.h | 6 ++-- src/platform/i386/kernel.c | 3 +- src/platform/i386/keyboard.c | 4 --- src/platform/i386/lapic.c | 12 +++----- src/platform/i386/serial.c | 9 ++++-- 8 files changed, 65 insertions(+), 32 deletions(-) diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index f1e0b698cb..9c88806946 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -1630,7 +1630,7 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * hwid_t hwid = __userregs_get1(regs); ret = hw_detach_rcvcap((struct cap_hw *)ch, hwid); - if (!ret) chal_irq_disable(hwid); + if (!ret) chal_irq_disable(hwid, get_cpuid()); break; } diff --git a/src/kernel/include/chal.h b/src/kernel/include/chal.h index 56096d59aa..956e95f34a 100644 --- a/src/kernel/include/chal.h +++ b/src/kernel/include/chal.h @@ -95,8 +95,8 @@ void chal_idle(void); void chal_timer_set(cycles_t cycles); void chal_timer_disable(void); -void chal_irq_disable(int irqline); -void chal_irq_enable(int irqline, int dest); +void chal_irq_disable(int irqline, cpuid_t cpu_id); +void chal_irq_enable(int irqline, cpuid_t cpu_id); void chal_init(void); diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c index 0662dd7cde..e98b36c3bf 100644 --- a/src/platform/i386/ioapic.c +++ b/src/platform/i386/ioapic.c @@ -91,7 +91,7 @@ static unsigned int ioapic_int_count; static union ioapic_int_redir_entry ioapic_int_isa_tmpl = { .delivmod = IOAPIC_DELIV_FIXED, - .destmod = IOAPIC_DST_PHYSICAL, + .destmod = IOAPIC_DST_LOGICAL, .polarity = IOAPIC_POL_ACTHIGH, .trigger = IOAPIC_TRIGGER_EDGE, .mask = 1, @@ -99,7 +99,7 @@ static union ioapic_int_redir_entry ioapic_int_isa_tmpl = { static union ioapic_int_redir_entry ioapic_int_pci_tmpl = { .delivmod = IOAPIC_DELIV_FIXED, - .destmod = IOAPIC_DST_PHYSICAL, + .destmod = IOAPIC_DST_LOGICAL, .polarity = IOAPIC_POL_ACTLOW, .trigger = IOAPIC_TRIGGER_EDGE, /* ref. barrelfish doesn't use level */ .mask = 1, @@ -185,7 +185,7 @@ ioapic_int_mask_set(int gsi, int mask, int dest) off = gsi - io->glbint_base; entry = ioapic_int_entry_read(io, off); entry.mask = mask ? 
1 : 0; - entry.destination = apicids[dest]; + entry.destination = dest; ioapic_int_entry_write(io, off, entry); entry = ioapic_int_entry_read(io, off); } @@ -211,6 +211,7 @@ ioapic_int_gsi(int gsi) void ioapic_int_mask(int gsi) { + /* clear destination when masking */ ioapic_int_mask_set(ioapic_int_gsi(gsi), 1, 0); } @@ -317,17 +318,55 @@ ioapic_iter(struct ioapic_cntl *io) } void -chal_irq_enable(int irq, int dest) +chal_irq_enable(int irq, cpuid_t cpu_id) { - if (irq - HW_IRQ_START >= (int)ioapic_int_count) return; - ioapic_int_unmask(irq - HW_IRQ_START, dest); + int gsi = ioapic_int_gsi(irq - HW_IRQ_START); + struct ioapic_info *io = ioapic_findbygsi(gsi); + union ioapic_int_redir_entry entry; + u8_t off; + + if (!io) return; + + off = gsi - io->glbint_base; + entry = ioapic_int_entry_read(io, off); + + /* the destination bitmap is 8 bits */ + if (irq - HW_IRQ_START >= (int)ioapic_int_count || cpu_id > 7) return; + + /* irq should be masked or in logical mode */ + assert(entry.mask || entry.destmod == IOAPIC_DST_LOGICAL); + + /* if irq is masked, destination should be 0 */ + assert(!entry.mask || !entry.destination); + + ioapic_int_unmask(irq - HW_IRQ_START, entry.destination | (u8_t)logical_apicids[cpu_id]); } void -chal_irq_disable(int irq) +chal_irq_disable(int irq, cpuid_t cpu_id) { - if (irq - HW_IRQ_START >= (int)ioapic_int_count) return; - ioapic_int_mask(irq - HW_IRQ_START); + int gsi = ioapic_int_gsi(irq - HW_IRQ_START); + struct ioapic_info *io = ioapic_findbygsi(gsi); + union ioapic_int_redir_entry entry; + u8_t off; + + if (!io) return; + + off = gsi - io->glbint_base; + entry = ioapic_int_entry_read(io, off); + + /* the destination bitmap is 8 bits */ + if (irq - HW_IRQ_START >= (int)ioapic_int_count || cpu_id > 7) return; + + assert(entry.mask || entry.destmod == IOAPIC_DST_LOGICAL); + + /* we should disable the irq if we remove the last core */ + if (!(entry.destination & ~logical_apicids[cpu_id])) { + ioapic_int_mask(irq - HW_IRQ_START); + return; + } + + ioapic_int_unmask(irq - HW_IRQ_START, entry.destination & ~logical_apicids[cpu_id]); } void diff --git a/src/platform/i386/ioapic.h b/src/platform/i386/ioapic.h index c8a1211c84..3cd3e31ea4 100644 --- a/src/platform/i386/ioapic.h +++ b/src/platform/i386/ioapic.h @@ -6,11 +6,11 @@ void ioapic_init(void); void ioapic_iter(struct ioapic_cntl *); -void ioapic_int_mask(int irq); -void ioapic_int_unmask(int irq, int dest); +// void ioapic_int_mask(int irq); +// void ioapic_int_unmask(int irq); void ioapic_int_disable(int irq); -void ioapic_int_enable(int irq, int cpu, int add); +void ioapic_int_enable(int irq, cpuid_t cpu_id); void ioapic_int_override(struct intsrcovrride_cntl *); diff --git a/src/platform/i386/kernel.c b/src/platform/i386/kernel.c index a767f72eff..904ff722c5 100644 --- a/src/platform/i386/kernel.c +++ b/src/platform/i386/kernel.c @@ -163,8 +163,7 @@ kmain(struct multiboot *mboot, u32_t mboot_magic, u32_t esp) kern_boot_comp(INIT_CORE); lapic_init(); hpet_init(); - chal_irq_enable(HW_SERIAL, 1); - chal_irq_enable(HW_KEYBOARD, 1); + chal_irq_enable(HW_SERIAL, 0); pic_init(); ioapic_init(); smp_init(cores_ready); diff --git a/src/platform/i386/keyboard.c b/src/platform/i386/keyboard.c index 4e9d476995..b38987faa2 100644 --- a/src/platform/i386/keyboard.c +++ b/src/platform/i386/keyboard.c @@ -8,7 +8,6 @@ keyboard_handler(struct pt_regs *regs) { u16_t scancode = 0; int preempt = 1; - static int chg = 0; lapic_ack(); @@ -18,8 +17,5 @@ keyboard_handler(struct pt_regs *regs) scancode = inb(KEY_DEVICE); 
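/*
 * A minimal, self-contained model (illustrative only, not part of these
 * patches) of how the IOAPIC helpers above treat a redirection-table entry:
 * each entry is 64 bits wide but is programmed as two 32-bit writes to
 * IOREDTBL n and n+1, and in logical destination mode the top byte acts as
 * a per-core bitmap.  The names below are hypothetical, and the layout
 * assumes the same little-endian, low-bits-first bitfield allocation the
 * kernel code relies on.
 */
#include <stdint.h>
#include <stdio.h>

union redir_entry_model {
	struct {
		uint64_t vector      : 8;  /* IDT vector, e.g. gsi + HW_IRQ_START */
		uint64_t other       : 48; /* delivery/dest mode, polarity, trigger, mask, ... */
		uint64_t destination : 8;  /* logical mode: one bit per target core */
	} f;
	struct {
		uint32_t low;              /* would be written to IOREDTBL + 2*n     */
		uint32_t high;             /* would be written to IOREDTBL + 2*n + 1 */
	} w;
};

int
main(void)
{
	union redir_entry_model e = { .f = { .vector = 32 + 4 } };

	e.f.destination |= 1u << 0; /* route the interrupt to core 0 ... */
	e.f.destination |= 1u << 2; /* ... and also to core 2            */
	printf("low dword %#x, high dword %#x\n", (unsigned)e.w.low, (unsigned)e.w.high);

	return 0;
}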
PRINTK("Keyboard press: %d\n", scancode); - chal_irq_enable(HW_KEYBOARD, chg); - chg = !chg; - return preempt; } diff --git a/src/platform/i386/lapic.c b/src/platform/i386/lapic.c index 6c6e582b5c..5076e8e553 100644 --- a/src/platform/i386/lapic.c +++ b/src/platform/i386/lapic.c @@ -200,20 +200,16 @@ lapic_find_localaddr(void *l) static u32_t cons_logical_id(const u32_t id) { - int lid; /* * FIXME: xAPIC only support 8 bits bitmap for logical destination, * So we will configure the logical id of cores with id larger than 7 - * to 7 which means we should find out a way(x2APIC) to fix this when we + * to 0 which means we should find out a way(x2APIC) to fix this when we * have more than 8 cores in ioapic. */ - if (id >= 7) { - lid = 7; - } else { - lid = id; - } - return (1ul << lid) << LAPIC_LDR_OFFSET; + if (id > 7) return 0; + + return (1ul << id) << LAPIC_LDR_OFFSET; } static u32_t diff --git a/src/platform/i386/serial.c b/src/platform/i386/serial.c index 7ef1ce787f..d5926b5e30 100644 --- a/src/platform/i386/serial.c +++ b/src/platform/i386/serial.c @@ -43,7 +43,7 @@ serial_handler(struct pt_regs *r) static int chg = 0; lapic_ack(); - + PRINTK("Serial\n"); serial = serial_recv(); /* @@ -72,9 +72,12 @@ serial_handler(struct pt_regs *r) break; } + if (chg / NUM_CPU % 2 == 0) chal_irq_enable(HW_SERIAL, chg % NUM_CPU); + else chal_irq_disable(HW_SERIAL, chg % NUM_CPU); + chg++; + PRINTK("Serial: %d\n", serial); - chal_irq_enable(HW_SERIAL, chg); - chg = !chg; + // printk("%c", serial); return preempt; } From 04041699c9a67b2c3f7c6e6a9cc411227e9bcad0 Mon Sep 17 00:00:00 2001 From: Zheng Yang Date: Sat, 21 Apr 2018 07:18:29 +0000 Subject: [PATCH 015/127] change hw_asnd_caps to 2d array --- src/kernel/include/hw.h | 10 +++++----- src/platform/i386/hpet.c | 4 ++-- src/platform/i386/idt.c | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/kernel/include/hw.h b/src/kernel/include/hw.h index fafc1ef7e1..6b28a17f2b 100644 --- a/src/kernel/include/hw.h +++ b/src/kernel/include/hw.h @@ -17,7 +17,7 @@ #define HW_IRQ_EXTERNAL_MIN 32 #define HW_IRQ_EXTERNAL_MAX 63 -struct cap_asnd hw_asnd_caps[HW_IRQ_TOTAL]; +struct cap_asnd hw_asnd_caps[NUM_CPU][HW_IRQ_TOTAL]; struct cap_hw { struct cap_header h; @@ -27,7 +27,7 @@ struct cap_hw { static void hw_asndcap_init(void) { - memset(&hw_asnd_caps, 0, sizeof(struct cap_asnd) * HW_IRQ_TOTAL); + memset(&hw_asnd_caps, 0, sizeof(struct cap_asnd) * HW_IRQ_TOTAL * NUM_CPU); } /* @@ -63,9 +63,9 @@ hw_attach_rcvcap(struct cap_hw *hwc, hwid_t hwid, struct cap_arcv *rcvc, capid_t { if (hwid < HW_IRQ_EXTERNAL_MIN || hwid > HW_IRQ_EXTERNAL_MAX) return -EINVAL; if (!(hwc->hw_bitmap & (1 << (hwid - HW_IRQ_EXTERNAL_MIN)))) return -EINVAL; - if (hw_asnd_caps[hwid].h.type == CAP_ASND) return -EEXIST; + if (hw_asnd_caps[get_cpuid()][hwid].h.type == CAP_ASND) return -EEXIST; - return asnd_construct(&hw_asnd_caps[hwid], rcvc, rcv_cap, 0, 0); + return asnd_construct(&hw_asnd_caps[get_cpuid()][hwid], rcvc, rcv_cap, 0, 0); } static int @@ -78,7 +78,7 @@ hw_detach_rcvcap(struct cap_hw *hwc, hwid_t hwid) * FIXME: Need to synchronize using __xx_pre and * __xx_post perhaps in asnd_deconstruct() */ - memset(&hw_asnd_caps[hwid], 0, sizeof(struct cap_asnd)); + memset(&hw_asnd_caps[get_cpuid()][hwid], 0, sizeof(struct cap_asnd)); return 0; } diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index 031c318ee8..72511ddc6b 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -185,7 +185,7 @@ hpet_periodic_handler(struct pt_regs *regs) if 
(unlikely(hpet_calibration_init)) hpet_calibration(); lapic_ack(); - preempt = cap_hw_asnd(&hw_asnd_caps[HW_HPET_PERIODIC], regs); + preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][HW_HPET_PERIODIC], regs); HPET_INT_ENABLE(HPET_PERIODIC); return preempt; @@ -199,7 +199,7 @@ hpet_oneshot_handler(struct pt_regs *regs) assert(!hpet_calibration_init); lapic_ack(); - preempt = cap_hw_asnd(&hw_asnd_caps[HW_HPET_ONESHOT], regs); + preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][HW_HPET_ONESHOT], regs); HPET_INT_ENABLE(HPET_ONESHOT); return preempt; diff --git a/src/platform/i386/idt.c b/src/platform/i386/idt.c index d471ba6d46..0081acc1fb 100644 --- a/src/platform/i386/idt.c +++ b/src/platform/i386/idt.c @@ -49,7 +49,7 @@ hw_handler(struct pt_regs *regs) * after user-level interrupt(rcv event) processing? */ lapic_ack(); - preempt = cap_hw_asnd(&hw_asnd_caps[regs->orig_ax], regs); + preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][regs->orig_ax], regs); return preempt; } From 2002aff9aa0ced8f38e53259a31e9923ddf06fef Mon Sep 17 00:00:00 2001 From: Zheng Yang Date: Sat, 21 Apr 2018 07:25:22 +0000 Subject: [PATCH 016/127] remove debug lines --- .../tests/micro_booter/mb_tests.c | 30 +++++++++---------- src/components/include/llprint.h | 4 +-- src/kernel/include/shared/cos_config.h | 2 +- src/platform/i386/serial.c | 7 +---- 4 files changed, 19 insertions(+), 24 deletions(-) diff --git a/src/components/implementation/tests/micro_booter/mb_tests.c b/src/components/implementation/tests/micro_booter/mb_tests.c index fd2949b2f3..7662f09963 100644 --- a/src/components/implementation/tests/micro_booter/mb_tests.c +++ b/src/components/implementation/tests/micro_booter/mb_tests.c @@ -857,21 +857,21 @@ test_run_mb(void) { cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - // test_timer(); - // test_budgets(); - // - // test_thds(); - // test_thds_perf(); - // - // test_mem(); - // - // test_async_endpoints(); - // test_async_endpoints_perf(); - // - // test_inv(); - // test_inv_perf(); - // - // test_captbl_expand(); + test_timer(); + test_budgets(); + + test_thds(); + test_thds_perf(); + + test_mem(); + + test_async_endpoints(); + test_async_endpoints_perf(); + + test_inv(); + test_inv_perf(); + + test_captbl_expand(); /* * FIXME: Preemption stack mechanism in the kernel is disabled. diff --git a/src/components/include/llprint.h b/src/components/include/llprint.h index 9a822916ad..94a4dda267 100644 --- a/src/components/include/llprint.h +++ b/src/components/include/llprint.h @@ -34,8 +34,8 @@ printc(char *fmt, ...) 
va_start(arg_ptr, fmt); ret = vsnprintf(s, len, fmt, arg_ptr); va_end(arg_ptr); - // cos_llprint(s, ret); - cos_print(s, ret); + cos_llprint(s, ret); + return ret; } diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index 34ffce3752..f8179556e2 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -37,7 +37,7 @@ #define BOOT_COMP_MAX_SZ (1 << 24) /* 16 MB for the booter component */ -#define NUM_CPU 4 +#define NUM_CPU 1 #define CPU_TIMER_FREQ 100 // set in your linux .config diff --git a/src/platform/i386/serial.c b/src/platform/i386/serial.c index d5926b5e30..561c5c368d 100644 --- a/src/platform/i386/serial.c +++ b/src/platform/i386/serial.c @@ -40,10 +40,9 @@ serial_handler(struct pt_regs *r) { char serial; int preempt = 1; - static int chg = 0; lapic_ack(); - PRINTK("Serial\n"); + serial = serial_recv(); /* @@ -72,10 +71,6 @@ serial_handler(struct pt_regs *r) break; } - if (chg / NUM_CPU % 2 == 0) chal_irq_enable(HW_SERIAL, chg % NUM_CPU); - else chal_irq_disable(HW_SERIAL, chg % NUM_CPU); - chg++; - PRINTK("Serial: %d\n", serial); // printk("%c", serial); From a643b0c66db798c3e9a1128db66670354ecc73ae Mon Sep 17 00:00:00 2001 From: Zheng Yang Date: Sat, 21 Apr 2018 07:33:08 +0000 Subject: [PATCH 017/127] add return value to chal_irq_XXX --- src/kernel/capinv.c | 4 ++-- src/kernel/include/chal.h | 4 ++-- src/platform/i386/ioapic.c | 17 ++++++++++------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 9c88806946..a22edc37d6 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -1622,7 +1622,7 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * if (!CAP_TYPECHK(rcvc, CAP_ARCV)) cos_throw(err, -EINVAL); ret = hw_attach_rcvcap((struct cap_hw *)ch, hwid, rcvc, rcvcap); - if (!ret) chal_irq_enable(hwid, get_cpuid()); + if (!ret) ret = chal_irq_enable(hwid, get_cpuid()); break; } @@ -1630,7 +1630,7 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * hwid_t hwid = __userregs_get1(regs); ret = hw_detach_rcvcap((struct cap_hw *)ch, hwid); - if (!ret) chal_irq_disable(hwid, get_cpuid()); + if (!ret) ret = chal_irq_disable(hwid, get_cpuid()); break; } diff --git a/src/kernel/include/chal.h b/src/kernel/include/chal.h index 956e95f34a..9fdf157df9 100644 --- a/src/kernel/include/chal.h +++ b/src/kernel/include/chal.h @@ -95,8 +95,8 @@ void chal_idle(void); void chal_timer_set(cycles_t cycles); void chal_timer_disable(void); -void chal_irq_disable(int irqline, cpuid_t cpu_id); -void chal_irq_enable(int irqline, cpuid_t cpu_id); +int chal_irq_disable(int irqline, cpuid_t cpu_id); +int chal_irq_enable(int irqline, cpuid_t cpu_id); void chal_init(void); diff --git a/src/platform/i386/ioapic.c b/src/platform/i386/ioapic.c index e98b36c3bf..1ae8231b7c 100644 --- a/src/platform/i386/ioapic.c +++ b/src/platform/i386/ioapic.c @@ -317,7 +317,7 @@ ioapic_iter(struct ioapic_cntl *io) } } -void +int chal_irq_enable(int irq, cpuid_t cpu_id) { int gsi = ioapic_int_gsi(irq - HW_IRQ_START); @@ -325,13 +325,13 @@ chal_irq_enable(int irq, cpuid_t cpu_id) union ioapic_int_redir_entry entry; u8_t off; - if (!io) return; + if (!io) return -EINVAL; off = gsi - io->glbint_base; entry = ioapic_int_entry_read(io, off); /* the destination bitmap is 8 bits */ - if (irq - HW_IRQ_START >= (int)ioapic_int_count || cpu_id > 7) return; + if (irq - HW_IRQ_START >= (int)ioapic_int_count || cpu_id > 7) return 
-EINVAL; /* irq should be masked or in logical mode */ assert(entry.mask || entry.destmod == IOAPIC_DST_LOGICAL); @@ -340,9 +340,11 @@ chal_irq_enable(int irq, cpuid_t cpu_id) assert(!entry.mask || !entry.destination); ioapic_int_unmask(irq - HW_IRQ_START, entry.destination | (u8_t)logical_apicids[cpu_id]); + + return 0; } -void +int chal_irq_disable(int irq, cpuid_t cpu_id) { int gsi = ioapic_int_gsi(irq - HW_IRQ_START); @@ -350,23 +352,24 @@ chal_irq_disable(int irq, cpuid_t cpu_id) union ioapic_int_redir_entry entry; u8_t off; - if (!io) return; + if (!io) return -EINVAL; off = gsi - io->glbint_base; entry = ioapic_int_entry_read(io, off); /* the destination bitmap is 8 bits */ - if (irq - HW_IRQ_START >= (int)ioapic_int_count || cpu_id > 7) return; + if (irq - HW_IRQ_START >= (int)ioapic_int_count || cpu_id > 7) return -EINVAL; assert(entry.mask || entry.destmod == IOAPIC_DST_LOGICAL); /* we should disable the irq if we remove the last core */ if (!(entry.destination & ~logical_apicids[cpu_id])) { ioapic_int_mask(irq - HW_IRQ_START); - return; + return 0; } ioapic_int_unmask(irq - HW_IRQ_START, entry.destination & ~logical_apicids[cpu_id]); + return 0; } void From 3f34e15017051d2a49ace72d1d10795a62fe7bce Mon Sep 17 00:00:00 2001 From: Zheng Yang Date: Mon, 23 Apr 2018 17:26:15 +0000 Subject: [PATCH 018/127] fix hpet --- src/platform/i386/idt.c | 138 ++++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 68 deletions(-) diff --git a/src/platform/i386/idt.c b/src/platform/i386/idt.c index c876cd5585..2bf3a4a22e 100644 --- a/src/platform/i386/idt.c +++ b/src/platform/i386/idt.c @@ -95,74 +95,76 @@ remap_irq_table(void) void idt_init(const cpuid_t cpu_id) { - idt_ptr.limit = (sizeof(struct idt_entry) * NUM_IDT_ENTRIES) - 1; - idt_ptr.base = (u32_t)&(idt_entries); - memset(&(idt_entries), 0, sizeof(struct idt_entry) * NUM_IDT_ENTRIES); - - outb(0x20, 0x11); - outb(0xA0, 0x11); - outb(0x21, 0x20); - outb(0xA1, 0x28); - outb(0x21, 0x04); - outb(0xA1, 0x02); - outb(0x21, 0x01); - outb(0xA1, 0x01); - outb(0x21, 0x0); - outb(0xA1, 0x0); - - idt_set_gate(IRQ_DIV_BY_ZERO_ERR_FAULT, (u32_t)div_by_zero_err_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_DEBUG_TRAP, (u32_t)debug_trap_irq, 0x08, 0x8E); - idt_set_gate(IRQ_BREAKPOINT_TRAP, (u32_t)breakpoint_trap_irq, 0x08, 0x8E); - idt_set_gate(IRQ_OVERFLOW_TRAP, (u32_t)overflow_trap_irq, 0x08, 0x8E); - idt_set_gate(IRQ_BOUND_RANGE_EXCEED_FAULT, (u32_t)bound_range_exceed_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_INVALID_OPCODE_FAULT, (u32_t)invalid_opcode_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_DEVICE_NOT_AVAIL_FAULT, (u32_t)device_not_avail_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_DOUBLE_FAULT_ABORT, (u32_t)double_fault_abort_irq, 0x08, 0x8E); - idt_set_gate(IRQ_INVALID_TSS_FAULT, (u32_t)invalid_tss_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_SEG_NOT_PRESENT_FAULT, (u32_t)seg_not_present_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_STACK_SEG_FAULT, (u32_t)stack_seg_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_GEN_PROTECT_FAULT, (u32_t)gen_protect_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_PAGE_FAULT, (u32_t)page_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_X87_FLOAT_PT_EXCEPT_FAULT, (u32_t)x87_float_pt_except_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_ALIGN_CHECK_FAULT, (u32_t)align_check_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_MACHINE_CHECK_ABORT, (u32_t)machine_check_abort_irq, 0x08, 0x8E); - idt_set_gate(IRQ_SMID_FLOAT_PT_EXCEPT_FAULT, (u32_t)smid_float_pt_except_fault_irq, 0x08, 0x8E); - 
idt_set_gate(IRQ_VIRTUALIZATION_EXCEPT_FAULT, (u32_t)virtualization_except_fault_irq, 0x08, 0x8E); - idt_set_gate(IRQ_SECURITY_EXCEPT_FAULT, (u32_t)security_except_fault_irq, 0x08, 0x8E); - - idt_set_gate(HW_PERIODIC, (u32_t)periodic_irq, 0x08, 0x8E); - idt_set_gate(HW_KEYBOARD, (u32_t)keyboard_irq, 0x08, 0x8E); - idt_set_gate(HW_ID3, (u32_t)handler_hw_34, 0x08, 0x8E); - idt_set_gate(HW_ID4, (u32_t)handler_hw_35, 0x08, 0x8E); - idt_set_gate(HW_SERIAL, (u32_t)serial_irq, 0x08, 0x8E); - idt_set_gate(HW_ID6, (u32_t)handler_hw_37, 0x08, 0x8E); - idt_set_gate(HW_ID7, (u32_t)handler_hw_38, 0x08, 0x8E); - idt_set_gate(HW_ID8, (u32_t)handler_hw_39, 0x08, 0x8E); - idt_set_gate(HW_ONESHOT, (u32_t)oneshot_irq, 0x08, 0x8E); - idt_set_gate(HW_ID10, (u32_t)handler_hw_41, 0x08, 0x8E); - idt_set_gate(HW_ID11, (u32_t)handler_hw_42, 0x08, 0x8E); - idt_set_gate(HW_ID12, (u32_t)handler_hw_43, 0x08, 0x8E); - idt_set_gate(HW_ID13, (u32_t)handler_hw_44, 0x08, 0x8E); - idt_set_gate(HW_ID14, (u32_t)handler_hw_45, 0x08, 0x8E); - idt_set_gate(HW_ID15, (u32_t)handler_hw_46, 0x08, 0x8E); - idt_set_gate(HW_ID16, (u32_t)handler_hw_47, 0x08, 0x8E); - idt_set_gate(HW_ID17, (u32_t)handler_hw_48, 0x08, 0x8E); - idt_set_gate(HW_ID18, (u32_t)handler_hw_49, 0x08, 0x8E); - idt_set_gate(HW_ID19, (u32_t)handler_hw_50, 0x08, 0x8E); - idt_set_gate(HW_ID20, (u32_t)handler_hw_51, 0x08, 0x8E); - idt_set_gate(HW_ID21, (u32_t)handler_hw_52, 0x08, 0x8E); - idt_set_gate(HW_ID22, (u32_t)handler_hw_53, 0x08, 0x8E); - idt_set_gate(HW_ID23, (u32_t)handler_hw_54, 0x08, 0x8E); - idt_set_gate(HW_ID24, (u32_t)handler_hw_55, 0x08, 0x8E); - idt_set_gate(HW_ID25, (u32_t)handler_hw_56, 0x08, 0x8E); - idt_set_gate(HW_ID26, (u32_t)handler_hw_57, 0x08, 0x8E); - idt_set_gate(HW_ID27, (u32_t)handler_hw_58, 0x08, 0x8E); - idt_set_gate(HW_ID28, (u32_t)handler_hw_59, 0x08, 0x8E); - idt_set_gate(HW_ID29, (u32_t)handler_hw_60, 0x08, 0x8E); - idt_set_gate(HW_ID30, (u32_t)handler_hw_61, 0x08, 0x8E); - idt_set_gate(HW_ID31, (u32_t)handler_hw_62, 0x08, 0x8E); - idt_set_gate(HW_LAPIC_SPURIOUS, (u32_t)lapic_spurious_irq, 0x08, 0x8E); - idt_set_gate(HW_LAPIC_TIMER, (u32_t)lapic_timer_irq, 0x08, 0x8E); + if (cpu_id == INIT_CORE) { + idt_ptr.limit = (sizeof(struct idt_entry) * NUM_IDT_ENTRIES) - 1; + idt_ptr.base = (u32_t)&(idt_entries); + memset(&(idt_entries), 0, sizeof(struct idt_entry) * NUM_IDT_ENTRIES); + + outb(0x20, 0x11); + outb(0xA0, 0x11); + outb(0x21, 0x20); + outb(0xA1, 0x28); + outb(0x21, 0x04); + outb(0xA1, 0x02); + outb(0x21, 0x01); + outb(0xA1, 0x01); + outb(0x21, 0x0); + outb(0xA1, 0x0); + + idt_set_gate(IRQ_DIV_BY_ZERO_ERR_FAULT, (u32_t)div_by_zero_err_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_DEBUG_TRAP, (u32_t)debug_trap_irq, 0x08, 0x8E); + idt_set_gate(IRQ_BREAKPOINT_TRAP, (u32_t)breakpoint_trap_irq, 0x08, 0x8E); + idt_set_gate(IRQ_OVERFLOW_TRAP, (u32_t)overflow_trap_irq, 0x08, 0x8E); + idt_set_gate(IRQ_BOUND_RANGE_EXCEED_FAULT, (u32_t)bound_range_exceed_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_INVALID_OPCODE_FAULT, (u32_t)invalid_opcode_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_DEVICE_NOT_AVAIL_FAULT, (u32_t)device_not_avail_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_DOUBLE_FAULT_ABORT, (u32_t)double_fault_abort_irq, 0x08, 0x8E); + idt_set_gate(IRQ_INVALID_TSS_FAULT, (u32_t)invalid_tss_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_SEG_NOT_PRESENT_FAULT, (u32_t)seg_not_present_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_STACK_SEG_FAULT, (u32_t)stack_seg_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_GEN_PROTECT_FAULT, (u32_t)gen_protect_fault_irq, 
0x08, 0x8E); + idt_set_gate(IRQ_PAGE_FAULT, (u32_t)page_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_X87_FLOAT_PT_EXCEPT_FAULT, (u32_t)x87_float_pt_except_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_ALIGN_CHECK_FAULT, (u32_t)align_check_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_MACHINE_CHECK_ABORT, (u32_t)machine_check_abort_irq, 0x08, 0x8E); + idt_set_gate(IRQ_SMID_FLOAT_PT_EXCEPT_FAULT, (u32_t)smid_float_pt_except_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_VIRTUALIZATION_EXCEPT_FAULT, (u32_t)virtualization_except_fault_irq, 0x08, 0x8E); + idt_set_gate(IRQ_SECURITY_EXCEPT_FAULT, (u32_t)security_except_fault_irq, 0x08, 0x8E); + + idt_set_gate(HW_PERIODIC, (u32_t)periodic_irq, 0x08, 0x8E); + idt_set_gate(HW_KEYBOARD, (u32_t)keyboard_irq, 0x08, 0x8E); + idt_set_gate(HW_ID3, (u32_t)handler_hw_34, 0x08, 0x8E); + idt_set_gate(HW_ID4, (u32_t)handler_hw_35, 0x08, 0x8E); + idt_set_gate(HW_SERIAL, (u32_t)serial_irq, 0x08, 0x8E); + idt_set_gate(HW_ID6, (u32_t)handler_hw_37, 0x08, 0x8E); + idt_set_gate(HW_ID7, (u32_t)handler_hw_38, 0x08, 0x8E); + idt_set_gate(HW_ID8, (u32_t)handler_hw_39, 0x08, 0x8E); + idt_set_gate(HW_ONESHOT, (u32_t)oneshot_irq, 0x08, 0x8E); + idt_set_gate(HW_ID10, (u32_t)handler_hw_41, 0x08, 0x8E); + idt_set_gate(HW_ID11, (u32_t)handler_hw_42, 0x08, 0x8E); + idt_set_gate(HW_ID12, (u32_t)handler_hw_43, 0x08, 0x8E); + idt_set_gate(HW_ID13, (u32_t)handler_hw_44, 0x08, 0x8E); + idt_set_gate(HW_ID14, (u32_t)handler_hw_45, 0x08, 0x8E); + idt_set_gate(HW_ID15, (u32_t)handler_hw_46, 0x08, 0x8E); + idt_set_gate(HW_ID16, (u32_t)handler_hw_47, 0x08, 0x8E); + idt_set_gate(HW_ID17, (u32_t)handler_hw_48, 0x08, 0x8E); + idt_set_gate(HW_ID18, (u32_t)handler_hw_49, 0x08, 0x8E); + idt_set_gate(HW_ID19, (u32_t)handler_hw_50, 0x08, 0x8E); + idt_set_gate(HW_ID20, (u32_t)handler_hw_51, 0x08, 0x8E); + idt_set_gate(HW_ID21, (u32_t)handler_hw_52, 0x08, 0x8E); + idt_set_gate(HW_ID22, (u32_t)handler_hw_53, 0x08, 0x8E); + idt_set_gate(HW_ID23, (u32_t)handler_hw_54, 0x08, 0x8E); + idt_set_gate(HW_ID24, (u32_t)handler_hw_55, 0x08, 0x8E); + idt_set_gate(HW_ID25, (u32_t)handler_hw_56, 0x08, 0x8E); + idt_set_gate(HW_ID26, (u32_t)handler_hw_57, 0x08, 0x8E); + idt_set_gate(HW_ID27, (u32_t)handler_hw_58, 0x08, 0x8E); + idt_set_gate(HW_ID28, (u32_t)handler_hw_59, 0x08, 0x8E); + idt_set_gate(HW_ID29, (u32_t)handler_hw_60, 0x08, 0x8E); + idt_set_gate(HW_ID30, (u32_t)handler_hw_61, 0x08, 0x8E); + idt_set_gate(HW_ID31, (u32_t)handler_hw_62, 0x08, 0x8E); + idt_set_gate(HW_LAPIC_SPURIOUS, (u32_t)lapic_spurious_irq, 0x08, 0x8E); + idt_set_gate(HW_LAPIC_TIMER, (u32_t)lapic_timer_irq, 0x08, 0x8E); + } struct { unsigned short length; From b6e847a6ec3e8594400eb4210a0bf2d95b1406f6 Mon Sep 17 00:00:00 2001 From: Zheng Yang Date: Mon, 23 Apr 2018 22:56:50 +0000 Subject: [PATCH 019/127] fix merge --- src/platform/i386/idt.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/platform/i386/idt.c b/src/platform/i386/idt.c index bc54898b76..8397900ef0 100644 --- a/src/platform/i386/idt.c +++ b/src/platform/i386/idt.c @@ -62,17 +62,6 @@ idt_init(const cpuid_t cpu_id) idt_ptr.base = (u32_t)&(idt_entries); memset(&(idt_entries), 0, sizeof(struct idt_entry) * NUM_IDT_ENTRIES); - outb(0x20, 0x11); - outb(0xA0, 0x11); - outb(0x21, 0x20); - outb(0xA1, 0x28); - outb(0x21, 0x04); - outb(0xA1, 0x02); - outb(0x21, 0x01); - outb(0xA1, 0x01); - outb(0x21, 0x0); - outb(0xA1, 0x0); - idt_set_gate(IRQ_DIV_BY_ZERO_ERR_FAULT, (u32_t)div_by_zero_err_fault_irq, 0x08, 0x8E); idt_set_gate(IRQ_DEBUG_TRAP, 
(u32_t)debug_trap_irq, 0x08, 0x8E); idt_set_gate(IRQ_BREAKPOINT_TRAP, (u32_t)breakpoint_trap_irq, 0x08, 0x8E); @@ -93,7 +82,7 @@ idt_init(const cpuid_t cpu_id) idt_set_gate(IRQ_VIRTUALIZATION_EXCEPT_FAULT, (u32_t)virtualization_except_fault_irq, 0x08, 0x8E); idt_set_gate(IRQ_SECURITY_EXCEPT_FAULT, (u32_t)security_except_fault_irq, 0x08, 0x8E); - idt_set_gate(HW_PERIODIC, (u32_t)periodic_irq, 0x08, 0x8E); + idt_set_gate(HW_HPET_PERIODIC, (u32_t)hpet_periodic_irq, 0x08, 0x8E); idt_set_gate(HW_KEYBOARD, (u32_t)keyboard_irq, 0x08, 0x8E); idt_set_gate(HW_ID3, (u32_t)handler_hw_34, 0x08, 0x8E); idt_set_gate(HW_ID4, (u32_t)handler_hw_35, 0x08, 0x8E); @@ -101,7 +90,7 @@ idt_init(const cpuid_t cpu_id) idt_set_gate(HW_ID6, (u32_t)handler_hw_37, 0x08, 0x8E); idt_set_gate(HW_ID7, (u32_t)handler_hw_38, 0x08, 0x8E); idt_set_gate(HW_ID8, (u32_t)handler_hw_39, 0x08, 0x8E); - idt_set_gate(HW_ONESHOT, (u32_t)oneshot_irq, 0x08, 0x8E); + idt_set_gate(HW_HPET_ONESHOT, (u32_t)hpet_oneshot_irq, 0x08, 0x8E); idt_set_gate(HW_ID10, (u32_t)handler_hw_41, 0x08, 0x8E); idt_set_gate(HW_ID11, (u32_t)handler_hw_42, 0x08, 0x8E); idt_set_gate(HW_ID12, (u32_t)handler_hw_43, 0x08, 0x8E); From 29ae53e5b5df27b00c602e665144885a06019f9b Mon Sep 17 00:00:00 2001 From: Zheng Yang Date: Mon, 23 Apr 2018 23:31:03 +0000 Subject: [PATCH 020/127] fix merge 2 --- src/platform/i386/acpi.c | 6 ++++++ src/platform/i386/lapic.c | 11 ++--------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/platform/i386/acpi.c b/src/platform/i386/acpi.c index bdc4d53aad..68aabe763b 100644 --- a/src/platform/i386/acpi.c +++ b/src/platform/i386/acpi.c @@ -187,4 +187,10 @@ acpi_madt_intsrc_iter(unsigned char *addr) } printk("\tMADT => LAPICs=%d, IOAPICs=%d\n", nl, nio); + + if (nl < NUM_CPU) { + printk("Number of LAPICs processed =%d not meeting the requirement = %d\n", nl, NUM_CPU); + printk("Please reconfigure NUM_CPU in Composite/HW-BIOS\n"); + assert(0); + } } diff --git a/src/platform/i386/lapic.c b/src/platform/i386/lapic.c index d88d7e55b9..083eadb704 100644 --- a/src/platform/i386/lapic.c +++ b/src/platform/i386/lapic.c @@ -150,22 +150,14 @@ void lapic_iter(struct lapic_cntl *l) { static int off = 1; - int us = lapic_apicid(); assert(l->header.len == sizeof(struct lapic_cntl)); printk("\tLAPIC found: coreid %d, apicid %d\n", l->proc_id, l->apic_id); - if (l->apic_id != us && l->flags && ncpus < NUM_CPU && NUM_CPU > 1) { + if (l->apic_id != apicids[INIT_CORE] && l->flags && ncpus < NUM_CPU && NUM_CPU > 1) { apicids[off++] = l->apic_id; ncpus++; } - printk("\tAPICs processed, %d cores\n", ncpus); - - if (ncpus != NUM_CPU) { - printk("Number of LAPICs processed =%d not meeting the requirement = %d\n", ncpus, NUM_CPU); - printk("Please reconfigure NUM_CPU in Composite/HW-BIOS\n"); - assert(0); - } } u32_t @@ -179,6 +171,7 @@ lapic_find_localaddr(void *l) printk("Initializing LAPIC @ %p\n", lapicaddr); + apicids[INIT_CORE] = lapic_apicid(); for (i = 0; i < length; i++) { sum += lapicaddr[i]; } From d5420756527ee7dfdf60173d99b9e02ff60a4ae9 Mon Sep 17 00:00:00 2001 From: Zheng Yang Date: Tue, 24 Apr 2018 00:23:11 +0000 Subject: [PATCH 021/127] use non-legacy mode of hpet --- src/platform/i386/hpet.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index 8c36ef6d11..15304bfede 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -49,12 +49,16 @@ #define HPET_TN_VAL_SET_CNF (1ll << 6) /* set to allow directly setting accumulator */ /* 1 << 7 
is reserved */ #define HPET_TN_32MODE_CNF (1ll << 8) /* 1 = force 32-bit access to 64-bit timer */ -/* #define HPET_TN_INT_ROUTE_CNF (1<<9:1<<13)*/ /* routing for interrupt */ +#define HPET_TN_INT_ROUTE_CNF (9) /* routing for interrupt */ #define HPET_TN_FSB_EN_CNF (1ll << 14) /* 1 = deliver interrupts via FSB instead of APIC */ #define HPET_TN_FSB_INT_DEL_CAP (1ll << 15) /* read only, 1 = FSB delivery available */ #define HPET_INT_ENABLE(n) (*hpet_interrupt = (0x1 << n)) /* Clears the INT n for level-triggered mode. */ +/* vector for interrupts */ +#define HPET_PERIODIC_VEC 0ll +#define HPET_ONESHOT_VEC 8ll + static volatile u32_t *hpet_capabilities; static volatile u64_t *hpet_config; static volatile u64_t *hpet_interrupt; @@ -224,10 +228,13 @@ hpet_set(hpet_type_t timer_type, u64_t cycles) /* Set a static value to count up to */ hpet_timers[timer_type].config = outconfig; + hpet_timers[timer_type].config |= HPET_ONESHOT_VEC << HPET_TN_INT_ROUTE_CNF; cycles += HPET_COUNTER; } else { /* Set a periodic value */ hpet_timers[timer_type].config = outconfig | HPET_TN_TYPE_CNF | HPET_TN_VAL_SET_CNF; + /* Set the interrupt vector for periodic timer */ + hpet_timers[timer_type].config |= HPET_PERIODIC_VEC << HPET_TN_INT_ROUTE_CNF; /* Reset main counter */ HPET_COUNTER = 0x00; } @@ -287,8 +294,6 @@ hpet_init(void) hpet_hpetcyc_per_tick = (HPET_DEFAULT_PERIOD_US * HPET_PICO_PER_MICRO) / pico_per_hpetcyc; printk("Enabling timer @ %p with tick granularity %ld picoseconds\n", hpet, pico_per_hpetcyc); - /* Enable legacy interrupt routing */ - *hpet_config |= HPET_LEG_RT_CNF; /* * Set the timer as specified. This assumes that the cycle @@ -305,4 +310,5 @@ hpet_init(void) hpet_calibration_init = 1; hpet_set(HPET_PERIODIC, hpet_hpetcyc_per_tick); chal_irq_enable(HW_HPET_PERIODIC, 0); + chal_irq_enable(HW_HPET_ONESHOT, 0); } From e5d7145fd871c292bdbecd555d8fd0edc54765f3 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 14 Feb 2019 16:40:09 -0500 Subject: [PATCH 022/127] Round-robin scheduling in sl (w/o priorities) --- src/components/lib/sl/Makefile | 2 +- src/components/lib/sl/sl_mod_rr.c | 98 +++++++++++++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 src/components/lib/sl/sl_mod_rr.c diff --git a/src/components/lib/sl/Makefile b/src/components/lib/sl/Makefile index 6e908cda0b..11683f3f03 100644 --- a/src/components/lib/sl/Makefile +++ b/src/components/lib/sl/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_lock.o sl_thd_static_backend.o +LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_rr.o sl_mod_fprr.o sl_lock.o sl_thd_static_backend.o LIBS=$(LIB_OBJS:%.o=%.a) CINC+=-m32 diff --git a/src/components/lib/sl/sl_mod_rr.c b/src/components/lib/sl/sl_mod_rr.c new file mode 100644 index 0000000000..bd796a1346 --- /dev/null +++ b/src/components/lib/sl/sl_mod_rr.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US + +struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; + +/* No RR yet */ +void +sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) +{ } + +struct sl_thd_policy * +sl_mod_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (ps_list_head_empty(&threads[cos_cpuid()])) goto done; + t = ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); + +done: + return t; +} + +void +sl_mod_block(struct sl_thd_policy 
*t) +{ + ps_list_rem_d(t); +} + +void +sl_mod_wakeup(struct sl_thd_policy *t) +{ + assert(ps_list_singleton_d(t)); + + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_yield(struct sl_thd_policy *t, struct sl_thd_policy *yield_to) +{ + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_thd_create(struct sl_thd_policy *t) +{ + t->priority = TCAP_PRIO_MIN; + t->period = 0; + t->period_usec = 0; + ps_list_init_d(t); +} + +void +sl_mod_thd_delete(struct sl_thd_policy *t) +{ ps_list_rem_d(t); } + +void +sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) +{ + int cpu = cos_cpuid(); + + switch (type) { + case SCHEDP_PRIO: + { + t->priority = v; + sl_thd_setprio(sl_mod_thd_get(t), t->priority); + ps_list_head_append_d(&threads[cos_cpuid()], t); + + break; + } + case SCHEDP_WINDOW: + { + assert(v >= SL_FPRR_PERIOD_US_MIN); + t->period_usec = v; + t->period = sl_usec2cyc(v); + /* FIXME: synchronize periods for all tasks */ + + break; + } + case SCHEDP_BUDGET: + { + break; + } + default: assert(0); + } +} + +void +sl_mod_init(void) +{ + ps_list_head_init(&threads[cos_cpuid()]); +} From 1542f3cf3cccf79d4579b4658285852ffac1f73b Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 14 Feb 2019 16:56:34 -0500 Subject: [PATCH 023/127] Prototype for user-level dispatching of kernel-level thread resources * Major limitations: no kernel entry allowed at anytime. That could potentially screw up things for the "current thread". * SCB and DCB are user-level only. Have to enable kernel-user page sharing, TODO! * Given these limitations, tested sl_yield(), and the benchmarks look promising. With only kernel-level dispatching: AVG: 620, WC:804 With only user-level dispatching: AVG: 340, WC: 750 With shared pages, a couple more branches in the kernel-dispatching. 
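The fast path itself lands in sl.h as GCC inline assembly (sl_thd_dispatch), so a rough, self-contained C sketch of the same control flow follows; every name in it (dispatch, dcb, scb, dispatch_slowpath) is an illustrative stand-in rather than the real sl API. The idea: record the current thread's resume ip/sp in its dispatch control block (DCB), publish the target thread in the per-CPU scheduler control block (SCB), and switch to it directly at user level, falling back to a kernel switch only when the target has no saved user-level stack pointer (sp == 0 doubles as the "currently running, not user-resumable" marker, which the resume label clears).

    #include <stdio.h>

    struct dcb { unsigned long ip, sp; };   /* per-thread dispatch control block (cf. sl_dcb_info) */
    struct scb { unsigned long curr_thd; }; /* per-CPU scheduler control block (cf. sl_scb_info)   */

    static struct scb cpu_scb;

    /* stand-in for the cos_switch()-based thd_dispatch_slowpath below */
    static int
    dispatch_slowpath(unsigned long next_thdcap)
    {
            printf("slowpath: ask the kernel to switch to thdcap %lu\n", next_thdcap);
            return 0;
    }

    static int
    dispatch(struct dcb *curr, struct dcb *next, unsigned long next_thdcap)
    {
            /* in the real asm these are the resume label's address and %esp */
            curr->ip = 0x1;
            curr->sp = 0x1000;

            /* target has no saved user-level stack: must go through the kernel */
            if (next->sp == 0) return dispatch_slowpath(next_thdcap);

            cpu_scb.curr_thd = next_thdcap; /* tell the kernel who is current now   */
            next->sp         = 0;           /* mark the target as running (the real
                                             * asm clears it at the resume label)   */
            /* real code: load next->sp into %esp and jmp *next->ip */
            printf("fastpath: user-level switch to thdcap %lu\n", next_thdcap);
            return 0;
    }

    int
    main(void)
    {
            struct dcb a = { 0, 0 }, b = { 0x1, 0x2000 };

            dispatch(&a, &b, 2); /* b has a saved sp: fast path              */
            dispatch(&a, &b, 2); /* b now marked running (sp == 0): slowpath */
            return 0;
    }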
--- src/components/Makefile.comp | 2 +- .../tests/unit_schedtests/unit_schedlib.c | 79 ++++++++++++++++++- src/components/include/sl.h | 56 ++++++++++--- src/components/include/sl_thd.h | 19 +++++ src/components/lib/sl/Makefile | 7 +- src/components/lib/sl/sl_sched.c | 9 +++ src/components/lib/sl/sl_slowpath.S | 9 +++ src/platform/i386/qemu-kvm.sh | 15 ++++ 8 files changed, 181 insertions(+), 15 deletions(-) create mode 100644 src/components/lib/sl/sl_slowpath.S create mode 100755 src/platform/i386/qemu-kvm.sh diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 2e01089411..a70420f886 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -52,6 +52,6 @@ SERVER_STUB=s_stub.o CLIENT_STUB=c_stub.o LIBCOSDEFKERN=-lcos_kernel_api -lcos_defkernel_api -LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcpu -lsl_child -lck +LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcpu -lsl_child -lck -lsl_slowpath LIBSLCAPMGR=$(LIBSLCORE) -lsl_capmgr LIBSLRAW=$(LIBSLCORE) -lsl_raw diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index 2ca97b36ff..dde4427fa9 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -31,6 +31,59 @@ #define N_TESTTHDS 8 #define WORKITERS 10000 +#define N_TESTTHDS_PERF 2 +#define PERF_ITERS 1000000 + +static volatile cycles_t mid_cycs = 0; +static volatile int testing = 1; + +void +test_thd_perffn(void *data) +{ + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + unsigned int i = 0; + + rdtscll(start_cycs); + sl_thd_yield(0); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff1_cycs = 0, diff2_cycs = 0; + + mid_cycs = 0; + rdtscll(start_cycs); + sl_thd_yield(0); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + diff1_cycs = mid_cycs - start_cycs; + diff2_cycs = end_cycs - mid_cycs; + + if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; + if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; + total_cycs += (diff1_cycs + diff2_cycs); + } + + PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, PERF_ITERS); + testing = 0; + /* done testing! let the spinfn cleanup! 
*/ + sl_thd_yield(0); + + sl_thd_exit(); +} + +void +test_thd_spinfn(void *data) +{ + while (likely(testing)) { + rdtscll(mid_cycs); + sl_thd_yield(0); + } + + sl_thd_exit(); +} + void test_thd_fn(void *data) { @@ -43,6 +96,22 @@ test_thd_fn(void *data) } } +void +test_yield_perf(void) +{ + int i; + struct sl_thd * threads[N_TESTTHDS_PERF]; + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; + + for (i = 0; i < N_TESTTHDS_PERF; i++) { + if (i == 1) threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); + else threads[i] = sl_thd_alloc(test_thd_spinfn, NULL); + assert(threads[i]); + sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + } +} + void test_yields(void) { @@ -51,9 +120,10 @@ test_yields(void) union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; for (i = 0; i < N_TESTTHDS; i++) { - threads[i] = sl_thd_alloc(test_thd_fn, (void *)(intptr_t)(i + 1)); + threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); assert(threads[i]); sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); } } @@ -151,9 +221,10 @@ cos_init(void) cos_defcompinfo_init(); sl_init(SL_MIN_PERIOD_US); - // test_yields(); - // test_blocking_directed_yield(); - test_timeout_wakeup(); + test_yield_perf(); + //test_yields(); + //test_blocking_directed_yield(); + //test_timeout_wakeup(); sl_sched_loop_nonblock(); diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 7252b07612..8b2e3281a6 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -40,6 +40,9 @@ #include #include +extern int thd_dispatch_slowpath(struct sl_thd *t, sched_tok_t tok); +extern int sl_thd_dispatch_slowpath(struct sl_thd *t, sched_tok_t tok); + /* Critical section (cs) API to protect scheduler data-structures */ struct sl_cs { union sl_cs_intern { @@ -51,6 +54,13 @@ struct sl_cs { } u; }; +struct sl_scb_info { + thdcap_t curr_thd; + tcap_prio_t curr_prio; + + cycles_t timer_next; +}; + struct sl_global_cpu { struct sl_cs lock; @@ -65,6 +75,7 @@ struct sl_global_cpu { cycles_t timer_next; tcap_time_t timeout_next; + struct sl_scb_info scb_info; struct ps_list_head event_head; /* all pending events for sched end-point */ }; @@ -76,6 +87,12 @@ sl__globals_cpu(void) return &(sl_global_cpu_data[cos_cpuid()]); } +static inline struct sl_scb_info * +sl_scb_info_cpu(void) +{ + return &(sl__globals_cpu()->scb_info); +} + static inline void sl_thd_setprio(struct sl_thd *t, tcap_prio_t p) { @@ -341,6 +358,8 @@ sl_timeout_oneshot(cycles_t absolute_us) { sl__globals_cpu()->timer_next = absolute_us; sl__globals_cpu()->timeout_next = tcap_cyc2time(absolute_us); + + sl_scb_info_cpu()->timer_next = absolute_us; } static inline void @@ -395,12 +414,38 @@ sl_thd_is_runnable(struct sl_thd *t) return (t->state == SL_THD_RUNNABLE || t->state == SL_THD_WOKEN); } +static inline int +sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) +{ + struct sl_scb_info *scb = sl_scb_info_cpu(); + + __asm__ __volatile__ ( \ + "movl $2f, (%%eax)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "cmp $0, 4(%%ebx)\n\t" \ + "je 1f\n\t" \ + "movl %%edx, (%%ecx)\n\t" \ + "movl 4(%%ebx), %%esp\n\t" \ + "jmp *(%%ebx)\n\t" \ + "1:\n\t" \ + "call thd_dispatch_slowpath\n\t" \ + "2:\n\t" \ + "movl $0, 4(%%ebx)\n\t" \ + : + : "a" (sl_thd_dcbinfo(curr)), "b" (sl_thd_dcbinfo(next)), "S" (next), "D" (tok), + "c" (&(scb->curr_thd)), "d" 
(sl_thd_thdcap(next)) + : "memory", "cc"); + + return 0; +} + static inline int sl_thd_activate(struct sl_thd *t, sched_tok_t tok) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = &dci->ci; struct sl_global_cpu *g = sl__globals_cpu(); + struct sl_scb_info *scb = sl_scb_info_cpu(); int ret = 0; if (t->properties & SL_THD_PROPERTY_SEND) { @@ -409,15 +454,8 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) return cos_switch(sl_thd_thdcap(t), sl_thd_tcap(t), t->prio, g->timeout_next, g->sched_rcv, tok); } else { - ret = cos_defswitch(sl_thd_thdcap(t), t->prio, t == g->sched_thd ? - TCAP_TIME_NIL : g->timeout_next, tok); - if (likely(t != g->sched_thd || ret != -EPERM)) return ret; - - /* - * Attempting to activate scheduler thread failed for no budget in it's tcap. - * Force switch to the scheduler with current tcap. - */ - return cos_switch(sl_thd_thdcap(g->sched_thd), 0, t->prio, 0, g->sched_rcv, tok); + return sl_thd_dispatch(t, tok, sl_thd_curr()); + //return sl_thd_dispatch_slowpath(t, tok); } } diff --git a/src/components/include/sl_thd.h b/src/components/include/sl_thd.h index bd1035f27c..beadf4be74 100644 --- a/src/components/include/sl_thd.h +++ b/src/components/include/sl_thd.h @@ -33,6 +33,11 @@ struct event_info { tcap_time_t timeout; }; +struct sl_dcb_info { + unsigned long ip; + unsigned long sp; +} __attribute__((__packed__)); + struct sl_thd { sl_thd_state_t state; /* @@ -95,8 +100,22 @@ struct sl_thd { struct event_info event_info; struct ps_list SL_THD_EVENT_LIST; /* list of events for the scheduler end-point */ + + struct sl_dcb_info dcb; }; +static inline struct sl_dcb_info * +sl_thd_dcbinfo(struct sl_thd *t) +{ return &t->dcb; } + +static inline unsigned long * +sl_thd_ip(struct sl_thd *t) +{ return &t->dcb.ip; } + +static inline unsigned long * +sl_thd_sp(struct sl_thd *t) +{ return &t->dcb.sp; } + static inline struct cos_aep_info * sl_thd_aepinfo(struct sl_thd *t) { return (t->aepinfo); } diff --git a/src/components/lib/sl/Makefile b/src/components/lib/sl/Makefile index 11683f3f03..6599156abd 100644 --- a/src/components/lib/sl/Makefile +++ b/src/components/lib/sl/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_rr.o sl_mod_fprr.o sl_lock.o sl_thd_static_backend.o +LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_lock.o sl_thd_static_backend.o sl_slowpath.o LIBS=$(LIB_OBJS:%.o=%.a) CINC+=-m32 @@ -13,5 +13,10 @@ all: $(LIBS) @$(CC) $(CFLAGS) $(CINC) -o $(@:%.a=%.o) -c $< @$(AR) cr lib$@ $(@:%.a=%.o) +%.a:%.S + $(info | [AS] Creating library file $@ from $^) + @$(AS) $(ASFLAGS) -c -o $(@:%.a=%.o) $^ + @$(AR) cr lib$@ $(@:%.a=%.o) + clean: @rm -f *.o *.a *.d diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 8a619da008..8cbdf5c5c3 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -190,6 +190,15 @@ sl_thd_sched_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t t return 0; } +int +sl_thd_dispatch_slowpath(struct sl_thd *t, sched_tok_t tok) +{ + struct sl_global_cpu *g = sl__globals_cpu(); + + /* no timeouts for now! */ + return cos_switch(sl_thd_thdcap(t), g->sched_tcap, t->prio, TCAP_TIME_NIL /*t == g->sched_thd ? TCAP_TIME_NIL : g->timeout_next*/, 0 /* don't switch to scheduler in the middle of this! 
*/, tok); +} + /* * Wake "t" up if it was previously blocked on cos_rcv and got * to run before the scheduler (tcap-activated)! diff --git a/src/components/lib/sl/sl_slowpath.S b/src/components/lib/sl/sl_slowpath.S new file mode 100644 index 0000000000..88cc2832a5 --- /dev/null +++ b/src/components/lib/sl/sl_slowpath.S @@ -0,0 +1,9 @@ +.text +.globl thd_dispatch_slowpath +.type thd_dispatch_slowpath, @function +thd_dispatch_slowpath: + pushl %edi + pushl %esi + call sl_thd_dispatch_slowpath + popl %esi + popl %edi diff --git a/src/platform/i386/qemu-kvm.sh b/src/platform/i386/qemu-kvm.sh new file mode 100755 index 0000000000..ea964376b4 --- /dev/null +++ b/src/platform/i386/qemu-kvm.sh @@ -0,0 +1,15 @@ +#!/bin/sh +if [ $# != 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +if ! [ -r $1 ]; then + echo "Can't open run-script" + exit 1 +fi + +MODULES=$(sh $1 | awk '/^Writing image/ { print $3; }' | tr '\n' ' ') + +#qemu-system-i386 -m 768 -nographic -kernel kernel.img -no-reboot -s -initrd "$(echo $MODULES | tr ' ' ',')" +qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=1,threads=1 -cpu host -nographic -m 768 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" From a2c1cd3abb7c7c745b416574f164a570120e13b2 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 14 Feb 2019 17:17:56 -0500 Subject: [PATCH 024/127] Remove warnings with -O3 compilation --- src/components/Makefile.comp | 6 +++--- src/components/implementation/capmgr/naive/cap_info.c | 3 ++- src/components/include/res_spec.h | 6 +++--- src/components/lib/posix/posix.c | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index a70420f886..0219f4ada9 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -40,9 +40,9 @@ LUAINC=-I$(LUADIR)/src -I$(LUABASE)/cos/include INC_PATH=-I./ -I$(CDIR)/include/ -I$(CDIR)/interface/ -I$(SHAREDINC) -I$(CKINCDIR) SHARED_FLAGS=-fno-merge-constants -nostdinc -nostdlib -fno-pic -OPT= -g -#OPT= -O3 -CFLAGS=-m32 -D__x86__ -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -fno-stack-protector -fno-omit-frame-pointer -Wno-unused-variable -fvar-tracking $(INC_PATH) $(MUSLINC) $(LWIPINC) $(LUAINC) $(OPT) $(SHARED_FLAGS) +OPT= -g -fvar-tracking +OPT= -O3 +CFLAGS=-m32 -D__x86__ -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -fno-stack-protector -fno-omit-frame-pointer -Wno-unused-variable $(INC_PATH) $(MUSLINC) $(LWIPINC) $(LUAINC) $(OPT) $(SHARED_FLAGS) CXXFLAGS=-fno-exceptions -fno-threadsafe-statics -Wno-write-strings $(CFLAGS) LDFLAGS=-melf_i386 MUSLCFLAGS=$(CFLAGS) -lc -lgcc -Xlinker -r diff --git a/src/components/implementation/capmgr/naive/cap_info.c b/src/components/implementation/capmgr/naive/cap_info.c index 3fa40f737a..1d2e242e7c 100644 --- a/src/components/implementation/capmgr/naive/cap_info.c +++ b/src/components/implementation/capmgr/naive/cap_info.c @@ -225,7 +225,8 @@ cap_shmem_region_find(cos_channelkey_t key) cbuf_t i, free = rglb->free_region_id; for (i = 1; i <= free; i++) { - if (ps_load((unsigned long *)&rglb->region_keys[i - 1]) == key) { + cos_channelkey_t *k = &rglb->region_keys[i - 1]; + if (ps_load((unsigned long *)k) == (unsigned long)key) { id = i; break; } diff --git a/src/components/include/res_spec.h b/src/components/include/res_spec.h index e109b8a2fb..e81736950a 100644 --- a/src/components/include/res_spec.h +++ b/src/components/include/res_spec.h @@ -64,10 +64,10 @@ sched_param_pack(sched_param_type_t type, unsigned 
int value) static inline void sched_param_get(sched_param_t sp, sched_param_type_t *type, unsigned int *value) { - struct sched_param_s s = *(struct sched_param_s *)(void *)&sp; + union sched_param_union us = *(union sched_param_union *)&sp; - *type = s.type; - *value = s.value; + *type = us.c.type; + *value = us.c.value; } #endif /* RES_SPEC_H */ diff --git a/src/components/lib/posix/posix.c b/src/components/lib/posix/posix.c index 73166a7524..fc1e8c366b 100644 --- a/src/components/lib/posix/posix.c +++ b/src/components/lib/posix/posix.c @@ -362,7 +362,7 @@ struct sl_lock futex_lock = SL_LOCK_STATIC_INIT(); int cos_futex_wait(struct futex_data *futex, int *uaddr, int val, const struct timespec *timeout) { - cycles_t deadline; + cycles_t deadline = sl_now(); microsec_t wait_time; struct futex_waiter waiter = (struct futex_waiter) { .thdid = sl_thdid() From 735c7239188db92a5d93ab3fb30a33ebedb46c72 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 14 Feb 2019 17:45:33 -0500 Subject: [PATCH 025/127] Use RR in sl unit test --- src/components/implementation/tests/unit_schedtests/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/implementation/tests/unit_schedtests/Makefile b/src/components/implementation/tests/unit_schedtests/Makefile index e46827dc8d..1735aff577 100644 --- a/src/components/implementation/tests/unit_schedtests/Makefile +++ b/src/components/implementation/tests/unit_schedtests/Makefile @@ -2,7 +2,7 @@ COMPONENT=unit_schedlibtests.o INTERFACES= DEPENDENCIES= IF_LIB= -ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_rr -lsl_thd_static_backend include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o From 45f653585c1d9ef2e71ae1b612144e3a5221d88a Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 18 Feb 2019 14:41:42 -0500 Subject: [PATCH 026/127] (incomplete) Modifications to have kernel shared DCB and SCB areas * TODO: API changes in capmgr for thread creation to pass in the address. My design is to let the users/schedulers decide the locality of a thread in a dcb region. * API changes obviously in the scheduling API, etc! * Most importantly, the kernel is not modified to detect an inconsistent state of an actual thread running vs the thread kernel thinks is running! And other checks for invocations etc! * But the good news is, it works! Using the same heap allocation type API with no special bump pointers or special capability type for SCB/DCB! 
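In short: the SCB page becomes an argument to component creation and a per-core DCB page becomes an argument to (init-)thread creation, both carved out with the same bump-allocation style as the heap. The pattern is condensed below from the boot_deps.h/cap_mgr.c hunks in this patch (the wrapper function and its locals are assumptions for illustration; the individual calls and their arguments are the ones the diffs introduce):

    /* assumes the usual Composite headers and an initialized child cos_compinfo */
    static void
    wire_scb_and_initdcb(struct cos_compinfo *boot_info, struct cos_compinfo *child_ci,
                         captblcap_t ct, pgtblcap_t pt, vaddr_t upcall_entry)
    {
            vaddr_t   scbpg, dcbpg;
            compcap_t cc;
            thdcap_t  initthd;

            /* SCB page for the child, wired in at component creation */
            scbpg = (vaddr_t)cos_scbpg_bump_allocn(child_ci, COS_SCB_SIZE);
            assert(scbpg);
            cc = cos_comp_alloc(boot_info, ct, pt, upcall_entry, scbpg);
            assert(cc);

            /* a page of DCBs for this core, handed to init-thread creation */
            dcbpg = (vaddr_t)cos_dcbpg_bump_allocn(child_ci, PAGE_SIZE);
            assert(dcbpg);
            initthd = cos_initthd_alloc(boot_info, cc, child_ci->pgtbl_cap, dcbpg);
            assert(initthd);
    }

This is why the thread-creation paths across the capmgr, sl, and the booter all grow an extra dcb/pgtbl argument in this patch.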
--- src/components/Makefile.comp | 4 +- .../implementation/capmgr/naive/cap_info.h | 2 + .../implementation/capmgr/naive/cap_mgr.c | 80 +++++++++-------- .../implementation/capmgr/naive/init.c | 22 ++++- .../implementation/capmgr/naive/mem_mgr.c | 30 +++++++ .../no_interface/llbooter/boot_deps.h | 85 +++++++++++++++---- .../no_interface/llbooter/llbooter.c | 4 +- .../no_interface/vkernel/micro_booter.h | 3 + .../no_interface/vkernel/vk_api.c | 14 ++- .../no_interface/vkernel/vkernel.c | 2 +- src/components/implementation/sched/sched.c | 4 +- .../implementation/sched/sched_info.c | 2 +- .../tests/micro_booter/mb_tests.c | 59 ++++++++++--- .../tests/micro_booter/micro_booter.c | 50 ++++++++++- .../tests/micro_booter/micro_booter.h | 4 + .../tests/unit_defcompinfo/unit_defcompinfo.c | 6 +- .../tests/unit_schedtests/unit_schedlib.c | 19 ++++- src/components/include/cos_component.h | 26 +++++- src/components/include/cos_dcb.h | 11 +++ src/components/include/cos_defkernel_api.h | 23 +++-- src/components/include/cos_kernel_api.h | 13 +-- src/components/include/hypercall.h | 16 +++- src/components/include/sl.h | 21 ++--- src/components/include/sl_thd.h | 15 ++-- src/components/include/sl_xcpu.h | 1 + src/components/interface/capmgr/memmgr.h | 3 + .../interface/capmgr/stubs/s_stub.S | 2 + src/components/lib/Makefile | 2 +- src/components/lib/cos_component.c | 6 +- src/components/lib/cos_dcbcapmgr.c | 45 ++++++++++ src/components/lib/cos_dcbraw.c | 47 ++++++++++ src/components/lib/cos_defkernel_api.c | 40 ++++----- src/components/lib/cos_kernel_api.c | 67 ++++++++++----- src/components/lib/sl/sl_capmgr.c | 39 +++++---- src/components/lib/sl/sl_raw.c | 54 ++++++------ src/components/lib/sl/sl_sched.c | 13 ++- src/kernel/capinv.c | 45 ++++++---- src/kernel/include/component.h | 26 +++++- src/kernel/include/pgtbl.h | 2 + src/kernel/include/shared/cos_config.h | 1 + src/kernel/include/shared/cos_types.h | 20 ++++- src/kernel/include/thd.h | 13 ++- src/platform/i386/boot_comp.c | 52 +++++++++--- 43 files changed, 752 insertions(+), 241 deletions(-) create mode 100644 src/components/include/cos_dcb.h create mode 100644 src/components/lib/cos_dcbcapmgr.c create mode 100644 src/components/lib/cos_dcbraw.c diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 0219f4ada9..602bb7a0a1 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -53,5 +53,5 @@ CLIENT_STUB=c_stub.o LIBCOSDEFKERN=-lcos_kernel_api -lcos_defkernel_api LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcpu -lsl_child -lck -lsl_slowpath -LIBSLCAPMGR=$(LIBSLCORE) -lsl_capmgr -LIBSLRAW=$(LIBSLCORE) -lsl_raw +LIBSLCAPMGR=$(LIBSLCORE) -lsl_capmgr -lcos_dcbcapmgr +LIBSLRAW=$(LIBSLCORE) -lsl_raw -lcos_dcbraw diff --git a/src/components/implementation/capmgr/naive/cap_info.h b/src/components/implementation/capmgr/naive/cap_info.h index 8a1ded7e78..fc9a85dc03 100644 --- a/src/components/implementation/capmgr/naive/cap_info.h +++ b/src/components/implementation/capmgr/naive/cap_info.h @@ -43,6 +43,8 @@ struct cap_comp_cpu_info { int p_thd_iterator; /* iterator for parent to get all threads created by capmgr in this component so far! 
*/ thdcap_t p_initthdcap; /* init thread's cap in parent */ thdid_t initthdid; /* init thread's tid */ + + vaddr_t initdcbpg; } CACHE_ALIGNED; struct cap_comp_info { diff --git a/src/components/implementation/capmgr/naive/cap_mgr.c b/src/components/implementation/capmgr/naive/cap_mgr.c index bcf330af6a..3b4f5c6618 100644 --- a/src/components/implementation/capmgr/naive/cap_mgr.c +++ b/src/components/implementation/capmgr/naive/cap_mgr.c @@ -19,7 +19,7 @@ capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx) if (!cap_info_is_sched(cur)) return 0; if (idx <= 0) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(r), NULL, idx, 0, 0, 0, NULL); + t = sl_thd_aep_alloc_ext(cap_info_dci(r), NULL, idx, 0, 0, 0, 0, NULL); if (!t) return 0; thdcap = cos_cap_cpy(cap_info_ci(r), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!thdcap) goto err; @@ -51,7 +51,7 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu if (cap_info_is_sched(s)) return 0; if (idx <= 0) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rs), NULL, idx, 0, 0, 0, NULL); + t = sl_thd_aep_alloc_ext(cap_info_dci(rs), NULL, idx, 0, 0, 0, 0, NULL); if (!t) return 0; thdcap = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!thdcap) goto err; @@ -71,20 +71,25 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu thdcap_t capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s) { - spdid_t cur = cos_inv_token(); - struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); - struct cap_comp_info *rc = cap_info_comp_find(cur); - struct cap_comp_info *rs = cap_info_comp_find(s); - struct sl_thd *t = NULL; - thdcap_t thdcap = 0; + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct cap_comp_info *rs = cap_info_comp_find(s); + struct cap_comp_cpu_info *rs_cpu = cap_info_cpu_local(rs); + struct cos_compinfo *rs_ci = cap_info_ci(rs); + struct sl_thd *t = NULL; + thdcap_t thdcap = 0; if (!rc || !cap_info_init_check(rc)) return 0; if (!rs || !cap_info_init_check(rs)) return 0; if (!cap_info_is_sched(cur) || !cap_info_is_child(rc, s)) return 0; if (cap_info_is_sched(s)) return 0; - t = sl_thd_initaep_alloc(cap_info_dci(rs), NULL, 0, 0, 0); + assert(rs_cpu->initdcbpg == 0); + rs_cpu->initdcbpg = (vaddr_t)cos_dcbpg_bump_allocn(rs_ci, PAGE_SIZE); + assert(rs_cpu->initdcbpg); + t = sl_thd_initaep_alloc(cap_info_dci(rs), NULL, 0, 0, 0, rs_cpu->initdcbpg); if (!t) return 0; /* child is not a scheduler, don't copy into child */ /* parent only needs the thdcap */ @@ -106,18 +111,20 @@ capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s) thdcap_t capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, spdid_t s, int owntc, cos_channelkey_t key) { - spdid_t cur = cos_inv_token(); - struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); - struct cap_comp_info *rc = cap_info_comp_find(cur); - struct cap_comp_info *rs = cap_info_comp_find(s); - struct sl_thd *t = NULL, *rinit = NULL; - thdcap_t thdcap = 0; - int ret; - tcap_t tc; - arcvcap_t rcv; - asndcap_t snd; - thdid_t tid; + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); 
+ struct cap_comp_info *rc = cap_info_comp_find(cur); + struct cap_comp_info *rs = cap_info_comp_find(s); + struct cap_comp_cpu_info *rs_cpu = cap_info_cpu_local(rs); + struct cos_compinfo *rs_ci = cap_info_ci(rs); + struct sl_thd *t = NULL, *rinit = NULL; + thdcap_t thdcap = 0; + int ret; + tcap_t tc; + arcvcap_t rcv; + asndcap_t snd; + thdid_t tid; if (!rc || !cap_info_init_check(rc)) return 0; if (!rs || !cap_info_init_check(rs)) return 0; @@ -126,36 +133,39 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, spdid_t s, rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = sl_thd_initaep_alloc(cap_info_dci(rs), rinit, 1, owntc, 0); + assert(rs_cpu->initdcbpg == 0); + rs_cpu->initdcbpg = (vaddr_t)cos_dcbpg_bump_allocn(rs_ci, PAGE_SIZE); + assert(rs_cpu->initdcbpg); + t = sl_thd_initaep_alloc(cap_info_dci(rs), rinit, 1, owntc, 0, rs_cpu->initdcbpg); if (!t) return 0; /* child is a scheduler.. copy initcaps */ - ret = cos_cap_cpy_at(cap_info_ci(rs), BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, cap_ci, sl_thd_thdcap(t)); + ret = cos_cap_cpy_at(rs_ci, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, cap_ci, sl_thd_thdcap(t)); if (ret) goto err; - ret = cos_cap_cpy_at(cap_info_ci(rs), BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cap_ci, sl_thd_rcvcap(t)); + ret = cos_cap_cpy_at(rs_ci, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cap_ci, sl_thd_rcvcap(t)); if (ret) goto err; if (owntc) { - ret = cos_cap_cpy_at(cap_info_ci(rs), BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, cap_ci, sl_thd_tcap(t)); + ret = cos_cap_cpy_at(rs_ci, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, cap_ci, sl_thd_tcap(t)); if (ret) goto err; } else { /* if it's a scheduler.. use parent's tcap (current spdid) */ - ret = cos_cap_cpy_at(cap_info_ci(rs), BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, cap_ci, sl_thd_tcap(rinit)); + ret = cos_cap_cpy_at(rs_ci, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, cap_ci, sl_thd_tcap(rinit)); if (ret) goto err; } /* parent needs tcap/rcv to manage time. thd/asnd to activate. 
*/ - ret = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); + ret = cos_cap_cpy(rs_ci, cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!ret) goto err; - rcv = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_ARCV, sl_thd_rcvcap(t)); + rcv = cos_cap_cpy(rs_ci, cap_ci, CAP_ARCV, sl_thd_rcvcap(t)); if (!rcv) goto err; - tc = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_TCAP, sl_thd_tcap(t)); + tc = cos_cap_cpy(rs_ci, cap_ci, CAP_TCAP, sl_thd_tcap(t)); if (!tc) goto err; - snd = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_ASND, sl_thd_asndcap(t)); + snd = cos_cap_cpy(rs_ci, cap_ci, CAP_ASND, sl_thd_asndcap(t)); if (!snd) goto err; cap_info_thd_init(rc, t, key); cap_info_initthd_init(rs, t, 0); - cap_info_cpu_local(rs)->p_initthdcap = thdcap = ret; - cap_info_cpu_local(rs)->initthdid = tid = sl_thd_thdid(t); + rs_cpu->p_initthdcap = thdcap = ret; + rs_cpu->initthdid = tid = sl_thd_thdid(t); *rcvtcret = (rcv << 16) | (tc); *sndtidret = (snd << 16) | (tid); @@ -190,7 +200,7 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, spdid_t s, rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rs), rinit, tidx, 1, owntc, 0, &srcrcv); + t = sl_thd_aep_alloc_ext(cap_info_dci(rs), rinit, tidx, 1, owntc, 0, 0, &srcrcv); if (!t) return 0; /* cur is a scheduler, copy thdcap */ ret = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); @@ -252,7 +262,7 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, thdclosure_index_t rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rc), rinit, tidx, 1, owntc, 0, &rcv); + t = sl_thd_aep_alloc_ext(cap_info_dci(rc), rinit, tidx, 1, owntc, 0, 0, &rcv); if (!t) return 0; /* current is a sched, so copy */ ret = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); diff --git a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index 99c98cea30..77c9b4e0ea 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -29,6 +29,7 @@ capmgr_comp_info_iter_cpu(void) spdid_t childid; comp_flag_t ch_flags; struct cos_aep_info aep; + vaddr_t initdcbpg = 0; memset(&aep, 0, sizeof(struct cos_aep_info)); assert(rci); @@ -40,7 +41,7 @@ capmgr_comp_info_iter_cpu(void) if (spdid == 0 || (spdid != cos_spd_id() && cap_info_is_child(btinfo, spdid))) { is_sched = (spdid == 0 || cap_info_is_sched_child(btinfo, spdid)) ? 1 : 0; - ret = hypercall_comp_initaep_get(spdid, is_sched, &aep); + ret = hypercall_comp_initaep_get(spdid, is_sched, &aep, &sched_spdid); assert(ret == 0); } @@ -59,6 +60,12 @@ capmgr_comp_info_iter_cpu(void) if (!remain_child) break; } + if (sched_spdid == 0) { + initdcbpg = hypercall_initdcb_get(spdid); + assert(initdcbpg); + rci_cpu->initdcbpg = initdcbpg; + } + if (aep.thd) { ithd = sl_thd_init_ext(&aep, NULL); assert(ithd); @@ -100,6 +107,7 @@ capmgr_comp_info_iter(void) spdid_t childid; comp_flag_t ch_flags; struct cos_aep_info aep; + vaddr_t initdcbpg = 0; memset(&aep, 0, sizeof(struct cos_aep_info)); @@ -111,10 +119,12 @@ capmgr_comp_info_iter(void) num_comps ++; if (spdid == 0 || (spdid != cos_spd_id() && cap_info_is_child(btinfo, spdid))) { + spdid_t ss = 0; + is_sched = (spdid == 0 || cap_info_is_sched_child(btinfo, spdid)) ? 
1 : 0; - ret = hypercall_comp_initaep_get(spdid, is_sched, &aep); - assert(ret == 0); + ret = hypercall_comp_initaep_get(spdid, is_sched, &aep, &ss); + assert(ret == 0 && ss == sched_spdid); } ret = hypercall_comp_frontier_get(spdid, &vasfr, &capfr); @@ -134,6 +144,12 @@ capmgr_comp_info_iter(void) if (!remain_child) break; } + if (sched_spdid == 0) { + initdcbpg = hypercall_initdcb_get(spdid); + assert(initdcbpg); + rci_cpu->initdcbpg = initdcbpg; + } + if (aep.thd) { ithd = sl_thd_init_ext(&aep, NULL); assert(ithd); diff --git a/src/components/implementation/capmgr/naive/mem_mgr.c b/src/components/implementation/capmgr/naive/mem_mgr.c index bb1d64936e..a0e58ee71c 100644 --- a/src/components/implementation/capmgr/naive/mem_mgr.c +++ b/src/components/implementation/capmgr/naive/mem_mgr.c @@ -24,6 +24,36 @@ memmgr_heap_page_allocn(unsigned long npages) return dst_pg; } +vaddr_t +memmgr_dcbpage_allocn(unsigned long npages) +{ + spdid_t cur = cos_inv_token(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + struct cap_comp_info *cur_rci = cap_info_comp_find(cur); + struct cos_compinfo *cur_ci = cap_info_ci(cur_rci); + vaddr_t pg; + + if (!cur_rci || !cap_info_init_check(cur_rci)) return 0; + if (!cur_ci) return 0; + + pg = (vaddr_t)cos_dcbpg_bump_allocn(cur_ci, npages * PAGE_SIZE); + + return pg; +} + +vaddr_t +memmgr_initdcbpage_retrieve(void) +{ + spdid_t cur = cos_inv_token(); + struct cap_comp_info *cur_rci = cap_info_comp_find(cur); + struct cap_comp_cpu_info *rci_cpu = cap_info_cpu_local(cur_rci); + + /* this should have been initialized either through the booter for capmgr/root-sched or by the capmgr on inithrd creation in cur component */ + assert(rci_cpu->initdcbpg); + + return rci_cpu->initdcbpg; +} + cbuf_t memmgr_shared_page_allocn_cserialized(vaddr_t *pgaddr, int *unused, unsigned long npages) { diff --git a/src/components/implementation/no_interface/llbooter/boot_deps.h b/src/components/implementation/no_interface/llbooter/boot_deps.h index e45121ee8f..3c9254beda 100644 --- a/src/components/implementation/no_interface/llbooter/boot_deps.h +++ b/src/components/implementation/no_interface/llbooter/boot_deps.h @@ -29,13 +29,15 @@ struct comp_sched_info { /* The booter uses this to keep track of each comp */ struct comp_cap_info { - struct cos_defcompinfo def_cinfo; - struct usr_inv_cap ST_user_caps[INTERFACE_UNDEF_SYMBS]; - vaddr_t vaddr_user_caps; /* vaddr of user caps table in comp */ - vaddr_t addr_start; - vaddr_t vaddr_mapped_in_booter; - vaddr_t upcall_entry; - struct comp_sched_info *schedinfo[NUM_CPU]; + struct cos_defcompinfo def_cinfo; + struct usr_inv_cap ST_user_caps[INTERFACE_UNDEF_SYMBS]; + vaddr_t vaddr_user_caps; /* vaddr of user caps table in comp */ + vaddr_t addr_start; + vaddr_t vaddr_mapped_in_booter; + vaddr_t upcall_entry; + vaddr_t scbpg, initdcbpg[NUM_CPU]; + struct comp_sched_info *schedinfo[NUM_CPU]; + struct cos_component_information *cobj_info; } new_comp_cap_info[MAX_NUM_SPDS]; int schedule[NUM_CPU][MAX_NUM_SPDS]; @@ -55,6 +57,14 @@ boot_spd_comp_schedinfo_curr_get(void) return &comp_schedinfo[cos_cpuid()][0]; } +static inline struct cos_component_information * +boot_spd_comp_cobj_info_get(spdid_t spdid) +{ + assert(spdid && spdid <= MAX_NUM_SPDS); + + return boot_spd_compcapinfo_get(spdid)->cobj_info; +} + static inline struct comp_sched_info * boot_spd_comp_schedinfo_get(spdid_t spdid) { @@ -146,14 +156,38 @@ boot_capmgr_mem_alloc(void) void boot_comp_mem_alloc(spdid_t spdid) { - struct cos_compinfo *compinfo = 
boot_spd_compinfo_get(spdid); - struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); + struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); + struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); unsigned long mem_sz = capmgr_spdid ? CAPMGR_MIN_UNTYPED_SZ : LLBOOT_NEWCOMP_UNTYPED_SZ; if (capmgr_spdid) return; cos_meminfo_alloc(compinfo, BOOT_MEM_KM_BASE, mem_sz); } +static void +boot_comp_scb_alloc(spdid_t spdid) +{ + struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); + struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); + struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); + + spdinfo->scbpg = (vaddr_t)cos_scbpg_bump_allocn(compinfo, COS_SCB_SIZE); + assert(spdinfo->scbpg); +} + +/* TODO: Should booter create that INITDCB page for all components for each core? */ +static void +boot_comp_dcb_alloc(spdid_t spdid) +{ + int i; + struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); + struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); + struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); + + spdinfo->initdcbpg[cos_cpuid()] = (vaddr_t)cos_dcbpg_bump_allocn(compinfo, PAGE_SIZE); + assert(spdinfo->initdcbpg[cos_cpuid()]); +} + /* Initialize just the captblcap and pgtblcap, due to hack for upcall_fn addr */ static void boot_compinfo_init(spdid_t spdid, captblcap_t *ct, pgtblcap_t *pt, u32_t heap_start_vaddr) @@ -189,8 +223,8 @@ boot_newcomp_sinv_alloc(spdid_t spdid) int i = 0; int intr_spdid; void *user_cap_vaddr; - struct cos_compinfo *interface_compinfo; - struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); + struct cos_compinfo *interface_compinfo; + struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); /* TODO: Purge rest of booter of spdid convention */ invtoken_t token = (invtoken_t)spdid; @@ -240,8 +274,10 @@ boot_newcomp_defcinfo_init(spdid_t spdid) struct cos_compinfo *child_ci = boot_spd_compinfo_get(spdid); struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); + struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); - child_aep->thd = cos_initthd_alloc(boot_info, child_ci->comp_cap); + boot_comp_dcb_alloc(spdid); + child_aep->thd = cos_initthd_alloc(boot_info, child_ci->comp_cap, child_ci->pgtbl_cap, spdinfo->initdcbpg[cos_cpuid()]); assert(child_aep->thd); if (spdsi->flags & COMP_FLAG_SCHED) { @@ -359,6 +395,7 @@ boot_newcomp_create(spdid_t spdid, struct cos_compinfo *comp_info) struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); + struct cos_component_information *cobj_info = boot_spd_comp_cobj_info_get(spdid); captblcap_t ct = compinfo->captbl_cap; pgtblcap_t pt = compinfo->pgtbl_cap; compcap_t cc; @@ -368,9 +405,11 @@ boot_newcomp_create(spdid_t spdid, struct cos_compinfo *comp_info) invtoken_t token = (invtoken_t)spdid; int ret; - cc = cos_comp_alloc(boot_info, ct, pt, (vaddr_t)spdinfo->upcall_entry); + boot_comp_scb_alloc(spdid); + cc = cos_comp_alloc(boot_info, ct, pt, (vaddr_t)spdinfo->upcall_entry, spdinfo->scbpg); assert(cc); compinfo->comp_cap = cc; + cobj_info->cos_scb_data = (struct cos_scb_info *)spdinfo->scbpg; /* Create sinv capability from Userspace to Booter components */ sinv = cos_sinv_alloc(boot_info, boot_info->comp_cap, (vaddr_t)hypercall_entry_rets_inv, 
token); @@ -511,7 +550,7 @@ boot_comp_cap_cpy_at(spdid_t dstid, capid_t dstslot, spdid_t srcid, cap_t captyp } static inline int -boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t rcvslot, tcap_t tcslot) +boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t rcvslot, tcap_t tcslot, spdid_t *parent) { struct comp_sched_info *si = NULL; int ret = -1; @@ -530,6 +569,8 @@ boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t if (ret) goto done; ret = boot_comp_cap_cpy_at(dstid, tcslot, srcid, CAP_TCAP); + *parent = si->parent_spdid; + done: return ret; } @@ -690,9 +731,12 @@ hypercall_entry(word_t *ret2, word_t *ret3, int op, word_t arg3, word_t arg4) thdcap_t thdslot = (arg3 << 16) >> 16; tcap_t tcslot = (arg4 << 16) >> 16;; arcvcap_t rcvslot = arg4 >> 16; + spdid_t parent_spdid = 0; if (!__hypercall_resource_access_check(client, srcid, 0)) return -EACCES; - ret1 = boot_comp_initaep_get(client, srcid, thdslot, rcvslot, tcslot); + ret1 = boot_comp_initaep_get(client, srcid, thdslot, rcvslot, tcslot, &parent_spdid); + + *ret2 = (word_t)parent_spdid; break; } @@ -754,6 +798,17 @@ hypercall_entry(word_t *ret2, word_t *ret3, int op, word_t arg3, word_t arg4) break; } + case HYPERCALL_COMP_INITDCB_GET: + { + spdid_t compid = arg3; + struct comp_cap_info *compinfo = NULL; + + if (!__hypercall_resource_access_check(client, compid, 0)) return 0; + if (!compid || compid > num_cobj) return 0; + compinfo = boot_spd_compcapinfo_get(compid); + + return compinfo->initdcbpg[cos_cpuid()]; + } case HYPERCALL_NUMCOMPS_GET: { ret1 = num_cobj + 1; /* including booter */ diff --git a/src/components/implementation/no_interface/llbooter/llbooter.c b/src/components/implementation/no_interface/llbooter/llbooter.c index b49c8b5475..30a4406f77 100644 --- a/src/components/implementation/no_interface/llbooter/llbooter.c +++ b/src/components/implementation/no_interface/llbooter/llbooter.c @@ -214,15 +214,17 @@ boot_comp_map_populate(struct cobj_header *h, spdid_t spdid, vaddr_t comp_info) } if (sect->flags & COBJ_SECT_CINFO) { + int k; + assert((left % PAGE_SIZE) == 0); assert(comp_info == (dest_daddr + (((left/PAGE_SIZE)-1)*PAGE_SIZE))); boot_process_cinfo(h, spdid, boot_spd_end(h), start_addr + (comp_info - init_daddr), comp_info); ci = (struct cos_component_information *)(start_addr + (comp_info - init_daddr)); + spdinfo->cobj_info = ci; hinfo = boot_spd_compcapinfo_get(h->id); hinfo->upcall_entry = ci->cos_upcall_entry; } - } return 0; diff --git a/src/components/implementation/no_interface/vkernel/micro_booter.h b/src/components/implementation/no_interface/vkernel/micro_booter.h index b6afc1bb80..4c46b3e5cf 100644 --- a/src/components/implementation/no_interface/vkernel/micro_booter.h +++ b/src/components/implementation/no_interface/vkernel/micro_booter.h @@ -47,4 +47,7 @@ tls_set(size_t off, unsigned long val) extern void test_run_vk(void); +void cos_dcb_info_init(void); +struct cos_dcb_info *cos_dcb_info_get(void); + #endif /* MICRO_BOOTER_H */ diff --git a/src/components/implementation/no_interface/vkernel/vk_api.c b/src/components/implementation/no_interface/vkernel/vk_api.c index d9978c521e..feb5c20a3f 100644 --- a/src/components/implementation/no_interface/vkernel/vk_api.c +++ b/src/components/implementation/no_interface/vkernel/vk_api.c @@ -1,3 +1,4 @@ +#include #include "vk_api.h" extern vaddr_t cos_upcall_entry; @@ -6,6 +7,12 @@ extern void vm_init(void *); extern void dom0_io_fn(void *); extern void vm_io_fn(void *); +struct 
cos_dcb_info * +cos_dcb_info_get(void) +{ + return cos_dcb_info_assign(); +} + static struct cos_aep_info * vm_schedaep_get(struct vms_info *vminfo) { return cos_sched_aep_get(&(vminfo->dci)); } @@ -19,6 +26,7 @@ vk_vm_create(struct vms_info *vminfo, struct vkernel_info *vkinfo) struct cos_aep_info *initaep = cos_sched_aep_get(vmdci); pgtblcap_t vmutpt; int ret; + vaddr_t scbpg, initdcbpg; assert(vminfo && vkinfo); @@ -27,7 +35,7 @@ vk_vm_create(struct vms_info *vminfo, struct vkernel_info *vkinfo) cos_meminfo_init(&(vmcinfo->mi), BOOT_MEM_KM_BASE, VM_UNTYPED_SIZE, vmutpt); ret = cos_defcompinfo_child_alloc(vmdci, (vaddr_t)&cos_upcall_entry, (vaddr_t)BOOT_MEM_VM_BASE, - VM_CAPTBL_FREE, 1); + VM_CAPTBL_FREE, 1, &initdcbpg, &scbpg); cos_compinfo_init(&(vminfo->shm_cinfo), vmcinfo->pgtbl_cap, vmcinfo->captbl_cap, vmcinfo->comp_cap, (vaddr_t)VK_VM_SHM_BASE, VM_CAPTBL_FREE, vk_cinfo); @@ -99,7 +107,7 @@ vk_vm_io_init(struct vms_info *vminfo, struct vms_info *dom0info, struct vkernel assert(vminfo->id && !dom0info->id); assert(vmidx >= 0 && vmidx <= VM_COUNT - 1); - d0io->iothds[vmidx] = cos_thd_alloc(vkcinfo, d0cinfo->comp_cap, dom0_io_fn, (void *)vminfo->id); + d0io->iothds[vmidx] = cos_thd_alloc(vkcinfo, d0cinfo->comp_cap, dom0_io_fn, (void *)vminfo->id, 0, 0); assert(d0io->iothds[vmidx]); d0io->iotcaps[vmidx] = cos_tcap_alloc(vkcinfo); assert(d0io->iotcaps[vmidx]); @@ -113,7 +121,7 @@ vk_vm_io_init(struct vms_info *vminfo, struct vms_info *dom0info, struct vkernel ret = cos_cap_cpy_at(d0cinfo, dom0_vio_rcvcap(vminfo->id), vkcinfo, d0io->iorcvs[vmidx]); assert(ret == 0); - vio->iothd = cos_thd_alloc(vkcinfo, vmcinfo->comp_cap, vm_io_fn, (void *)vminfo->id); + vio->iothd = cos_thd_alloc(vkcinfo, vmcinfo->comp_cap, vm_io_fn, (void *)vminfo->id, 0, 0); assert(vio->iothd); vio->iorcv = cos_arcv_alloc(vkcinfo, vio->iothd, vmaep->tc, vkcinfo->comp_cap, vmaep->rcv); assert(vio->iorcv); diff --git a/src/components/implementation/no_interface/vkernel/vkernel.c b/src/components/implementation/no_interface/vkernel/vkernel.c index 49106ac335..bab3e7af1f 100644 --- a/src/components/implementation/no_interface/vkernel/vkernel.c +++ b/src/components/implementation/no_interface/vkernel/vkernel.c @@ -53,7 +53,7 @@ cos_init(void) cos_compinfo_init(&vk_info.shm_cinfo, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, (vaddr_t)VK_VM_SHM_BASE, BOOT_CAPTBL_FREE, ci); - vk_info.termthd = cos_thd_alloc(vk_cinfo, vk_cinfo->comp_cap, vk_terminate, NULL); + vk_info.termthd = cos_thd_alloc(vk_cinfo, vk_cinfo->comp_cap, vk_terminate, NULL, 0, 0); assert(vk_info.termthd); vk_info.sinv = cos_sinv_alloc(vk_cinfo, vk_cinfo->comp_cap, (vaddr_t)__inv_vkernel_hypercallfn, 0); diff --git a/src/components/implementation/sched/sched.c b/src/components/implementation/sched/sched.c index 426f1971de..5fa092a2ee 100644 --- a/src/components/implementation/sched/sched.c +++ b/src/components/implementation/sched/sched.c @@ -50,7 +50,7 @@ sched_thd_create_cserialized(thdclosure_index_t idx) dci = sched_child_defci_get(sched_childinfo_find(c)); if (!dci) return 0; - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 0, 0, 0, NULL); + t = sl_thd_aep_alloc_ext(dci, NULL, idx, 0, 0, 0, 0, NULL); if (!t) return 0; return sl_thd_thdid(t); @@ -67,7 +67,7 @@ sched_aep_create_cserialized(arcvcap_t *extrcv, int *unused, thdclosure_index_t dci = sched_child_defci_get(sched_childinfo_find(c)); if (!dci) return 0; - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 1, owntc, key, extrcv); + t = sl_thd_aep_alloc_ext(dci, NULL, idx, 1, owntc, key, 0, 
extrcv); if (!t) return 0; return sl_thd_thdid(t); diff --git a/src/components/implementation/sched/sched_info.c b/src/components/implementation/sched/sched_info.c index 5e32dfeed8..5629220deb 100644 --- a/src/components/implementation/sched/sched_info.c +++ b/src/components/implementation/sched/sched_info.c @@ -90,7 +90,7 @@ sched_childinfo_init_intern(int is_raw) assert(schedinfo); child_dci = sched_child_defci_get(schedinfo); - initthd = sl_thd_initaep_alloc(child_dci, NULL, childflags & COMP_FLAG_SCHED, childflags & COMP_FLAG_SCHED ? 1 : 0, 0); + initthd = sl_thd_initaep_alloc(child_dci, NULL, childflags & COMP_FLAG_SCHED, childflags & COMP_FLAG_SCHED ? 1 : 0, 0, 0); assert(initthd); sched_child_initthd_set(schedinfo, initthd); diff --git a/src/components/implementation/tests/micro_booter/mb_tests.c b/src/components/implementation/tests/micro_booter/mb_tests.c index bf89469ed1..ecfa74cdbd 100644 --- a/src/components/implementation/tests/micro_booter/mb_tests.c +++ b/src/components/implementation/tests/micro_booter/mb_tests.c @@ -4,6 +4,33 @@ unsigned int cyc_per_usec; +static void +test_scb(void) +{ + thdcap_t thdc; + struct cos_scb_info *scb_info = cos_scb_info_get(); + + scb_info->curr_thd = BOOT_CAPTBL_SELF_INITTHD_CPU_BASE; + thdc = cos_introspect(&booter_info, booter_info.comp_cap, COMP_GET_SCB_CURTHD); + if (thdc == (thdcap_t)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE) PRINTC("Success: Kernel and user have consistent thdcap set in SCB\n"); + else PRINTC("Failure: Kernel and user don't have a consistent thdcap in SCB\n"); +} + +static void +test_dcb(void) +{ + struct cos_dcb_info *init_dcbpg = cos_init_dcb_get(); + + assert(init_dcbpg); +} + +static void +test_scb_dcb(void) +{ + test_scb(); + test_dcb(); +} + static void thd_fn_perf(void *d) { @@ -23,7 +50,7 @@ test_thds_perf(void) long long start_swt_cycles = 0, end_swt_cycles = 0; int i; - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_perf, NULL); + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_perf, NULL, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); assert(ts); cos_thd_switch(ts); @@ -56,7 +83,7 @@ test_thds(void) intptr_t i; for (i = 0; i < TEST_NTHDS; i++) { - ts[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn, (void *)i); + ts[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn, (void *)i, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); assert(ts[i]); tls_test[cos_cpuid()][i] = i; cos_thd_mod(&booter_info, ts[i], &tls_test[cos_cpuid()][i]); @@ -252,7 +279,7 @@ test_async_endpoints(void) PRINTC("Creating threads, and async end-points.\n"); /* parent rcv capabilities */ tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent, - (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); assert(tcp); tccp = cos_tcap_alloc(&booter_info); assert(tccp); @@ -264,7 +291,7 @@ test_async_endpoints(void) } /* child rcv capabilities */ - tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn, (void *)tcp); + tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn, (void *)tcp, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); assert(tcc); tccc = cos_tcap_alloc(&booter_info); assert(tccc); @@ -297,7 +324,7 @@ test_async_endpoints_perf(void) /* parent rcv capabilities */ tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent_perf, - (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 
booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); assert(tcp); tccp = cos_tcap_alloc(&booter_info); assert(tccp); @@ -306,7 +333,7 @@ test_async_endpoints_perf(void) if (cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, TCAP_PRIO_MAX + 1)) assert(0); /* child rcv capabilities */ - tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn_perf, (void *)tcp); + tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn_perf, (void *)tcp, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); assert(tcc); tccc = cos_tcap_alloc(&booter_info); assert(tccc); @@ -340,7 +367,7 @@ test_timer(void) cycles_t c = 0, p = 0, t = 0; PRINTC("Starting timer test.\n"); - tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL); + tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); for (i = 0; i <= 16; i++) { thdid_t tid; @@ -387,7 +414,7 @@ exec_cluster_alloc(struct exec_cluster *e, cos_thd_fn_t fn, void *d, arcvcap_t p { e->tcc = cos_tcap_alloc(&booter_info); assert(e->tcc); - e->tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, fn, d); + e->tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, fn, d, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); assert(e->tc); e->rc = cos_arcv_alloc(&booter_info, e->tc, e->tcc, booter_info.comp_cap, parentc); assert(e->rc); @@ -798,8 +825,11 @@ test_inv(void) compcap_t cc; sinvcap_t ic; unsigned int r; + vaddr_t scbpg; - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL); + scbpg = (vaddr_t)cos_scbpg_bump_allocn(&booter_info, PAGE_SIZE); + assert(scbpg); + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL, scbpg); assert(cc > 0); ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); assert(ic > 0); @@ -817,8 +847,11 @@ test_inv_perf(void) int i; long long total_inv_cycles = 0LL, total_ret_cycles = 0LL; unsigned int ret; + vaddr_t scbpg; - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL); + scbpg = (vaddr_t)cos_scbpg_bump_allocn(&booter_info, PAGE_SIZE); + assert(scbpg); + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL, scbpg); assert(cc > 0); ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); assert(ic > 0); @@ -847,8 +880,11 @@ test_captbl_expand(void) { int i; compcap_t cc; + vaddr_t scbpg; - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL); + scbpg = (vaddr_t)cos_scbpg_bump_allocn(&booter_info, PAGE_SIZE); + assert(scbpg); + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL, scbpg); assert(cc); for (i = 0; i < 1024; i++) { sinvcap_t ic; @@ -868,6 +904,7 @@ test_run_mb(void) test_timer(); test_budgets(); + test_scb_dcb(); test_thds(); test_thds_perf(); diff --git a/src/components/implementation/tests/micro_booter/micro_booter.c b/src/components/implementation/tests/micro_booter/micro_booter.c index a4d2176aef..3868c90c02 100644 --- a/src/components/implementation/tests/micro_booter/micro_booter.c +++ b/src/components/implementation/tests/micro_booter/micro_booter.c @@ -1,3 +1,4 @@ +#include #include "micro_booter.h" struct cos_compinfo booter_info; @@ -17,11 +18,43 @@ term_fn(void *d) static int test_done[NUM_CPU]; +#define COS_DCB_MAX_PER_PAGE (PAGE_SIZE / sizeof(struct cos_dcb_info)) +static unsigned long 
free_off[NUM_CPU] = { 0 }, total[NUM_CPU] = { 0 }; +static struct cos_dcb_info *dcb_st[NUM_CPU] = { NULL }; + +void +cos_dcb_info_init(void) +{ + free_off[cos_cpuid()] = 1; + + dcb_st[cos_cpuid()] = cos_init_dcb_get(); +} + +struct cos_dcb_info * +cos_dcb_info_get(void) +{ + unsigned int curr_off = 0; + + curr_off = ps_faa(&free_off[cos_cpuid()], 1); + if (curr_off == COS_DCB_MAX_PER_PAGE) { + /* will need a version that calls down to capmgr for more pages */ + dcb_st[cos_cpuid()] = cos_dcbpg_bump_allocn(&booter_info, PAGE_SIZE); + assert(dcb_st[cos_cpuid()]); + + free_off[cos_cpuid()] = 0; + + return dcb_st[cos_cpuid()]; + } + + return (dcb_st[cos_cpuid()] + curr_off); +} + void cos_init(void) { int cycs, i; static int first_init = 1, init_done = 0; + struct cos_dcb_info *initaddr, *termaddr; cycs = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); printc("\t%d cycles per microsecond\n", cycs); @@ -35,9 +68,24 @@ cos_init(void) } while (!init_done) ; + cos_dcb_info_init(); + initaddr = cos_init_dcb_get(); + PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_SP)); + initaddr->ip = 10; + initaddr->sp = 20; + PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_SP)); - termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL); + + termaddr = cos_dcb_info_get(); + termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL, booter_info.pgtbl_cap, (vaddr_t)termaddr); assert(termthd[cos_cpuid()]); + PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)termthd[cos_cpuid()], (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_SP)); + termaddr->ip = 30; + termaddr->sp = 40; + PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)termthd[cos_cpuid()], (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_SP)); + PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_SP)); + PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)termthd[cos_cpuid()], (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_SP)); + PRINTC("Micro Booter started.\n"); test_run_mb(); diff --git a/src/components/implementation/tests/micro_booter/micro_booter.h b/src/components/implementation/tests/micro_booter/micro_booter.h index daba68899f..632897aae9 100644 --- a/src/components/implementation/tests/micro_booter/micro_booter.h +++ b/src/components/implementation/tests/micro_booter/micro_booter.h @@ -30,6 +30,7 @@ #define ITER 10000 #define TEST_NTHDS 5 +extern struct cos_dcb_info *init_dcbinfo[]; extern struct cos_compinfo booter_info; extern thdcap_t termthd[]; /* switch to this to 
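
/*
 * A compact restatement of the DCB consistency check that cos_init() above
 * performs with PRINTC; written as a test helper.  The helper name is
 * hypothetical, it reuses thd_fn from test_thds(), and it assumes the
 * THD_GET_DCB_IP/THD_GET_DCB_SP introspection operations return the values
 * currently stored in the thread's cos_dcb_info slot (the thread is never run).
 */
static void
test_dcb_consistency(void)
{
	struct cos_dcb_info *dcb = cos_dcb_info_get();
	thdcap_t             t;

	t = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn, NULL,
			  booter_info.pgtbl_cap, (vaddr_t)dcb);
	assert(t);

	/* user-level writes to the DCB slot ... */
	dcb->ip = 10;
	dcb->sp = 20;

	/* ... should be visible through kernel introspection of the same slot */
	assert((unsigned long)cos_introspect(&booter_info, t, THD_GET_DCB_IP) == dcb->ip);
	assert((unsigned long)cos_introspect(&booter_info, t, THD_GET_DCB_SP) == dcb->sp);
}
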
shutdown */ extern unsigned long tls_test[][TEST_NTHDS]; @@ -53,4 +54,7 @@ tls_set(size_t off, unsigned long val) extern void test_run_mb(void); +void cos_dcb_info_init(void); +struct cos_dcb_info *cos_dcb_info_get(void); + #endif /* MICRO_BOOTER_H */ diff --git a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c index 21133d1e21..5b6a306e4f 100644 --- a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c +++ b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c @@ -56,7 +56,7 @@ test_aeps(void) asndcap_t snd; printc("\tCreating AEP [%d]\n", i); - ret = cos_aep_tcap_alloc(&(test_aep[i]), BOOT_CAPTBL_SELF_INITTCAP_BASE, aep_thd_fn, (void *)i); + ret = cos_aep_tcap_alloc(&(test_aep[i]), BOOT_CAPTBL_SELF_INITTCAP_BASE, aep_thd_fn, (void *)i, 0); assert(ret == 0); snd = cos_asnd_alloc(ci, test_aep[i].rcv, ci->captbl_cap); @@ -125,7 +125,7 @@ cos_init(void) cos_defcompinfo_init(); for (id = 0; id < CHILD_COMP_COUNT; id++) { - vaddr_t vm_range, addr; + vaddr_t vm_range, addr, scbaddr, dcbaddr; pgtblcap_t child_utpt; int is_sched = ((id == CHILD_SCHED_ID) ? 1 : 0); struct cos_compinfo *child_ci = cos_compinfo_get(&child_defci[id]); @@ -136,7 +136,7 @@ cos_init(void) cos_meminfo_init(&(child_ci->mi), BOOT_MEM_KM_BASE, CHILD_UNTYPED_SIZE, child_utpt); cos_defcompinfo_child_alloc(&child_defci[id], (vaddr_t)&cos_upcall_entry, - (vaddr_t)BOOT_MEM_VM_BASE, BOOT_CAPTBL_FREE, is_sched); + (vaddr_t)BOOT_MEM_VM_BASE, BOOT_CAPTBL_FREE, is_sched, &dcbaddr, &scbaddr); printc("\t\tCopying new capabilities\n"); ret = cos_cap_cpy_at(child_ci, BOOT_CAPTBL_SELF_CT, ci, child_ci->captbl_cap); diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index dde4427fa9..55343b1728 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -40,6 +40,8 @@ static volatile int testing = 1; void test_thd_perffn(void *data) { + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + thdcap_t thdc = 0; cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; unsigned int i = 0; @@ -68,6 +70,9 @@ test_thd_perffn(void *data) PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, PERF_ITERS); testing = 0; /* done testing! let the spinfn cleanup! 
*/ + PRINTC("CURR THD: %u\n", (unsigned int)cos_introspect(ci, ci->comp_cap, COMP_GET_SCB_CURTHD)); + thdc = sl_thd_thdcap(sl_thd_curr()); + PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)thdc, (unsigned long)cos_introspect(ci, thdc, THD_GET_DCB_IP), (unsigned long)cos_introspect(ci, thdc, THD_GET_DCB_SP)); sl_thd_yield(0); sl_thd_exit(); @@ -76,11 +81,17 @@ test_thd_perffn(void *data) void test_thd_spinfn(void *data) { + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + thdcap_t thdc = 0; + while (likely(testing)) { rdtscll(mid_cycs); sl_thd_yield(0); } + thdc = sl_thd_thdcap(sl_thd_curr()); + PRINTC("CURR THD: %u\n", (unsigned int)cos_introspect(ci, ci->comp_cap, COMP_GET_SCB_CURTHD)); + PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)thdc, (unsigned long)cos_introspect(ci, thdc, THD_GET_DCB_IP), (unsigned long)cos_introspect(ci, thdc, THD_GET_DCB_SP)); sl_thd_exit(); } @@ -100,8 +111,13 @@ void test_yield_perf(void) { int i; - struct sl_thd * threads[N_TESTTHDS_PERF]; + struct sl_thd *threads[N_TESTTHDS_PERF]; + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; + struct cos_scb_info *scb_info = cos_scb_info_get(); + + scb_info->curr_thd = BOOT_CAPTBL_SELF_INITTHD_CPU_BASE; + PRINTC("CURR THD: %u\n", (unsigned int)cos_introspect(ci, ci->comp_cap, COMP_GET_SCB_CURTHD)); for (i = 0; i < N_TESTTHDS_PERF; i++) { if (i == 1) threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); @@ -109,6 +125,7 @@ test_yield_perf(void) assert(threads[i]); sl_thd_param_set(threads[i], sp.v); PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)sl_thd_thdcap(threads[i]), (unsigned long)cos_introspect(ci, sl_thd_thdcap(threads[i]), THD_GET_DCB_IP), (unsigned long)cos_introspect(ci, sl_thd_thdcap(threads[i]), THD_GET_DCB_SP)); } } diff --git a/src/components/include/cos_component.h b/src/components/include/cos_component.h index 1aa854b69f..6c0e4da537 100644 --- a/src/components/include/cos_component.h +++ b/src/components/include/cos_component.h @@ -203,12 +203,36 @@ cos_spd_id(void) static inline void * cos_get_heap_ptr(void) { - return (void *)cos_comp_info.cos_heap_ptr; + /* page at heap_ptr is actually the SCB_PAGE for the booter alone! */ + unsigned int off = (cos_spd_id() == 0 ? (COS_SCB_SIZE + (PAGE_SIZE * NUM_CPU)) : 0); + void *heap_ptr = ((void *)(cos_comp_info.cos_heap_ptr + off)); + + return heap_ptr; +} + +static inline struct cos_scb_info * +cos_scb_info_get(void) +{ + struct cos_scb_info *scb_info = cos_comp_info.cos_scb_data; + + if (cos_spd_id() == 0) scb_info = (struct cos_scb_info *)(cos_comp_info.cos_heap_ptr); + + return scb_info; +} + +static inline struct cos_dcb_info * +cos_init_dcb_get(void) +{ + /* created at boot-time for the first component in the system! */ + if (cos_spd_id() == 0) return (struct cos_dcb_info *)(cos_comp_info.cos_heap_ptr + COS_SCB_SIZE + (PAGE_SIZE * cos_cpuid())); + + return NULL; } static inline void cos_set_heap_ptr(void *addr) { + /* FIXME: fix this for the hack if it's not going to work! 
*/ cos_comp_info.cos_heap_ptr = (vaddr_t)addr; } diff --git a/src/components/include/cos_dcb.h b/src/components/include/cos_dcb.h new file mode 100644 index 0000000000..81cc8b4395 --- /dev/null +++ b/src/components/include/cos_dcb.h @@ -0,0 +1,11 @@ +#ifndef COS_DCB_H +#define COS_DCB_H + +#include + +#define COS_DCB_PERPG_MAX (PAGE_SIZE / sizeof(struct cos_dcb_info)) + +void cos_dcb_info_init(void); +struct cos_dcb_info *cos_dcb_info_assign(void); + +#endif /* COS_DCB_H */ diff --git a/src/components/include/cos_defkernel_api.h b/src/components/include/cos_defkernel_api.h index fa083c27ef..c39e77c0f5 100644 --- a/src/components/include/cos_defkernel_api.h +++ b/src/components/include/cos_defkernel_api.h @@ -36,7 +36,7 @@ struct cos_aep_info { thdid_t tid; arcvcap_t rcv; cos_aepthd_fn_t fn; - void * data; + void *data; }; /* Default Component information */ @@ -53,7 +53,7 @@ cos_aepthd_fn(void *data) { struct cos_aep_info *aep_info = (struct cos_aep_info *)data; cos_aepthd_fn_t aep_fn = aep_info->fn; - void * fn_data = aep_info->data; + void *fn_data = aep_info->data; (aep_fn)(aep_info->rcv, fn_data); @@ -96,44 +96,49 @@ void cos_defcompinfo_sched_init(void); * cos_defcompinfo_child_alloc: called to create a new child component including initial capabilities like pgtbl, * captbl, compcap, aep. if is_sched is set, scheduling end-point will also be created for the child component, else, * the current component's scheduler will remain the scheduler for the child component. + * NOTE: dcbuaddr is the address in child_dci page-table and scbuaddr too!. */ int cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, vaddr_t heap_ptr, - capid_t cap_frontier, int is_sched); + capid_t cap_frontier, int is_sched, vaddr_t *dcbuaddr, vaddr_t *scbuaddr); /* * cos_aep_alloc: creates a new async activation end-point which includes thread, tcap and rcv capabilities. * struct cos_aep_info passed in, must not be stack allocated. */ -int cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data); +int cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, vaddr_t dcbuaddr); /* * cos_aep_alloc: creates a new async activation end-point, using an existing tcap. * struct cos_aep_info passed in, must not be stack allocated. */ -int cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data); +int cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data, vaddr_t dcbuaddr); /* * cos_initaep_alloc: create an initaep in the @child_dci and using sched->rcv as the parent, sets up cos_sched_ape_get(@child_dci) with the init capabilities. * if @sched == NULL, use the current scheduler in cos_sched_aep_get(cos_defcompinfo_get_cur()). * if @is_sched == 0, creates only the init thread (does not need @sched parameter) + * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_initaep_alloc(struct cos_defcompinfo *child_dci, struct cos_aep_info *sched, int is_sched); +int cos_initaep_alloc(struct cos_defcompinfo *child_dci, struct cos_aep_info *sched, int is_sched, vaddr_t dcbuaddr); /* * cos_initaep_tcap_alloc: same as cos_initaep_alloc with is_sched == 1, except it doesn't create a new tcap, * uses the tcap passed in @tc. + * NOTE: dcbuaddr is the address in child_dci page-table. 
*/ -int cos_initaep_tcap_alloc(struct cos_defcompinfo *child_dci, tcap_t tc, struct cos_aep_info *sched); +int cos_initaep_tcap_alloc(struct cos_defcompinfo *child_dci, tcap_t tc, struct cos_aep_info *sched, vaddr_t dcbuaddr); /* * cos_aep_alloc_ext: creates a new async activation end-point which includes thread, tcap and rcv capabilities in the child_dci component using sched_aep->rcv. * if @child_dci == NULL, create in the current component. + * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, thdclosure_index_t idx); +int cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, thdclosure_index_t idx, vaddr_t dcbuaddr); /* * cos_aep_alloc_ext: creates a new async activation end-point which includes thread, tcap and rcv capabilities in the child_dci component using sched_aep->rcv. * if @child_dci == NULL, create in the current component. + * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, tcap_t tc, thdclosure_index_t idx); +int cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, tcap_t tc, thdclosure_index_t idx, vaddr_t dcbuaddr); /* * cos_defswitch: thread switch api using the default scheduling tcap and rcv. diff --git a/src/components/include/cos_kernel_api.h b/src/components/include/cos_kernel_api.h index 911f025e01..28cd57d713 100644 --- a/src/components/include/cos_kernel_api.h +++ b/src/components/include/cos_kernel_api.h @@ -108,16 +108,16 @@ int cos_pgtbl_intern_expandwith(struct cos_compinfo *ci, pgtblcap_t intern, vadd * correctly populate ci (allocating all resources from ci_resources). 
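
/*
 * Usage sketch for the extended child-allocation API documented above, mirroring
 * the vk_api.c call site: the parent receives the child's DCB and SCB addresses
 * (in the child's page-table) through the two new out-parameters.  Variable
 * names and the wrapper are illustrative only.
 */
static void
child_boot_example(struct cos_defcompinfo *child_dci)
{
	vaddr_t child_dcb = 0, child_scb = 0;
	int     ret;

	ret = cos_defcompinfo_child_alloc(child_dci, (vaddr_t)&cos_upcall_entry,
					  (vaddr_t)BOOT_MEM_VM_BASE, VM_CAPTBL_FREE,
					  1 /* is_sched */, &child_dcb, &child_scb);
	assert(ret == 0);
	/* child_dcb/child_scb can later be handed to the child or inspected by a
	 * resource manager; both are virtual addresses in the child's page-table. */
}
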
*/ int cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, - struct cos_compinfo *ci_resources); + struct cos_compinfo *ci_resources, vaddr_t *scbpg); captblcap_t cos_captbl_alloc(struct cos_compinfo *ci); pgtblcap_t cos_pgtbl_alloc(struct cos_compinfo *ci); -compcap_t cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, vaddr_t entry); +compcap_t cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, vaddr_t entry, vaddr_t scbpg); typedef void (*cos_thd_fn_t)(void *); -thdcap_t cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data); -thdcap_t cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx); +thdcap_t cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, pgtblcap_t ptc, vaddr_t dcbuaddr); +thdcap_t cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx, pgtblcap_t ptc, vaddr_t dcbuaddr); /* Create the initial (cos_init) thread */ -thdcap_t cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp); +thdcap_t cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, pgtblcap_t, vaddr_t dcbuaddr); sinvcap_t cos_sinv_alloc(struct cos_compinfo *srcci, compcap_t dstcomp, vaddr_t entry, invtoken_t token); arcvcap_t cos_arcv_alloc(struct cos_compinfo *ci, thdcap_t thdcap, tcap_t tcapcap, compcap_t compcap, arcvcap_t enotif); asndcap_t cos_asnd_alloc(struct cos_compinfo *ci, arcvcap_t arcvcap, captblcap_t ctcap); @@ -125,6 +125,9 @@ asndcap_t cos_asnd_alloc(struct cos_compinfo *ci, arcvcap_t arcvcap, captblcap_t void *cos_page_bump_alloc(struct cos_compinfo *ci); void *cos_page_bump_allocn(struct cos_compinfo *ci, size_t sz); +void *cos_dcbpg_bump_allocn(struct cos_compinfo *ci, size_t sz); +void *cos_scbpg_bump_allocn(struct cos_compinfo *ci, size_t sz); + capid_t cos_cap_cpy(struct cos_compinfo *dstci, struct cos_compinfo *srcci, cap_t srcctype, capid_t srccap); int cos_cap_cpy_at(struct cos_compinfo *dstci, capid_t dstcap, struct cos_compinfo *srcci, capid_t srccap); diff --git a/src/components/include/hypercall.h b/src/components/include/hypercall.h index cf080eba04..192f13ee67 100644 --- a/src/components/include/hypercall.h +++ b/src/components/include/hypercall.h @@ -14,6 +14,7 @@ enum hypercall_cntl { HYPERCALL_COMP_CAPTBLCAP_GET, HYPERCALL_COMP_PGTBLCAP_GET, HYPERCALL_COMP_CAPFRONTIER_GET, + HYPERCALL_COMP_INITDCB_GET, /* per-core, each core. 
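
/*
 * Minimal sketch of the updated thread-allocation call declared above: every
 * thread now names the page-table that maps its DCB and the virtual address of
 * its cos_dcb_info slot.  It assumes cos_dcb_info_init() has already run so
 * cos_dcb_info_assign() can hand out a slot; worker_fn and spawn_worker are
 * placeholders.
 */
static void
worker_fn(void *d)
{ /* thread body */ }

static thdcap_t
spawn_worker(struct cos_compinfo *ci)
{
	struct cos_dcb_info *dcb = cos_dcb_info_assign();

	assert(dcb);
	return cos_thd_alloc(ci, ci->comp_cap, worker_fn, NULL,
			     ci->pgtbl_cap, (vaddr_t)dcb);
}
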
only for threads created by llbooter */ HYPERCALL_COMP_INITAEP_GET, HYPERCALL_COMP_CHILD_NEXT, @@ -47,11 +48,12 @@ hypercall_comp_init_done(void) /* Note: This API can be called ONLY by components that manage capability resources */ static inline int -hypercall_comp_initaep_get(spdid_t spdid, int is_sched, struct cos_aep_info *aep) +hypercall_comp_initaep_get(spdid_t spdid, int is_sched, struct cos_aep_info *aep, spdid_t *parent_spdid) { thdcap_t thdslot = 0; arcvcap_t rcvslot = 0; tcap_t tcslot = 0; + word_t r3 = 0; struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); int ret = 0; @@ -67,8 +69,8 @@ hypercall_comp_initaep_get(spdid_t spdid, int is_sched, struct cos_aep_info *aep } /* capid_t though is unsigned long, only assuming it occupies 16bits for packing */ - ret = cos_sinv(BOOT_CAPTBL_SINV_CAP, 0, HYPERCALL_COMP_INITAEP_GET, - spdid << 16 | thdslot, rcvslot << 16 | tcslot); + ret = cos_sinv_rets(BOOT_CAPTBL_SINV_CAP, 0, HYPERCALL_COMP_INITAEP_GET, + spdid << 16 | thdslot, rcvslot << 16 | tcslot, (word_t *)&parent_spdid, &r3); if (ret) return ret; aep->thd = thdslot; @@ -96,7 +98,7 @@ hypercall_comp_info_get(spdid_t spdid, pgtblcap_t *ptslot, captblcap_t *ctslot, /* capid_t though is unsigned long, only assuming it occupies 16bits for packing */ ret = cos_sinv_rets(BOOT_CAPTBL_SINV_CAP, 0, HYPERCALL_COMP_INFO_GET, - spdid << 16 | (*compslot), (*ptslot) << 16 | (*ctslot), &r2, &r3); + spdid << 16 | (*compslot), (*ptslot) << 16 | (*ctslot), &r2, &r3); *parentid = r2; return ret; @@ -186,6 +188,12 @@ hypercall_comp_capfrontier_get(spdid_t spdid) return cap_frontier; } +static inline vaddr_t +hypercall_initdcb_get(spdid_t spdid) +{ + return (vaddr_t)cos_sinv(BOOT_CAPTBL_SINV_CAP, 0, HYPERCALL_COMP_INITDCB_GET, spdid, 0); +} + static inline int hypercall_numcomps_get(void) { diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 8b2e3281a6..0b649e7830 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -54,13 +54,6 @@ struct sl_cs { } u; }; -struct sl_scb_info { - thdcap_t curr_thd; - tcap_prio_t curr_prio; - - cycles_t timer_next; -}; - struct sl_global_cpu { struct sl_cs lock; @@ -75,7 +68,7 @@ struct sl_global_cpu { cycles_t timer_next; tcap_time_t timeout_next; - struct sl_scb_info scb_info; + struct cos_scb_info *scb_info; struct ps_list_head event_head; /* all pending events for sched end-point */ }; @@ -87,10 +80,10 @@ sl__globals_cpu(void) return &(sl_global_cpu_data[cos_cpuid()]); } -static inline struct sl_scb_info * +static inline struct cos_scb_info * sl_scb_info_cpu(void) { - return &(sl__globals_cpu()->scb_info); + return (sl__globals_cpu()->scb_info); } static inline void @@ -299,8 +292,8 @@ struct sl_thd *sl_thd_aep_alloc(cos_aepthd_fn_t fn, void *data, int own_tcap, co */ struct sl_thd *sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched); -struct sl_thd *sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key); -struct sl_thd *sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, arcvcap_t *extrcv); +struct sl_thd *sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr); +struct sl_thd *sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, vaddr_t 
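
/*
 * Hedged usage sketch for the new hypercall above: a capability manager fetches
 * the init-DCB page that llbooter allocated for component `spdid`.  The booter
 * returns initdcbpg[cos_cpuid()], so the value is per-core and each core issues
 * its own invocation; the wrapper name is hypothetical.
 */
static vaddr_t
initdcb_for_this_core(spdid_t spdid)
{
	vaddr_t dcbaddr = hypercall_initdcb_get(spdid);

	assert(dcbaddr); /* 0 => invalid id or no access, per the booter-side checks */
	return dcbaddr;
}
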
dcbuaddr, arcvcap_t *extrcv); struct sl_thd *sl_thd_init_ext(struct cos_aep_info *aep, struct sl_thd *sched_thd); @@ -417,7 +410,7 @@ sl_thd_is_runnable(struct sl_thd *t) static inline int sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { - struct sl_scb_info *scb = sl_scb_info_cpu(); + struct cos_scb_info *scb = sl_scb_info_cpu(); __asm__ __volatile__ ( \ "movl $2f, (%%eax)\n\t" \ @@ -445,7 +438,6 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = &dci->ci; struct sl_global_cpu *g = sl__globals_cpu(); - struct sl_scb_info *scb = sl_scb_info_cpu(); int ret = 0; if (t->properties & SL_THD_PROPERTY_SEND) { @@ -455,7 +447,6 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) g->timeout_next, g->sched_rcv, tok); } else { return sl_thd_dispatch(t, tok, sl_thd_curr()); - //return sl_thd_dispatch_slowpath(t, tok); } } diff --git a/src/components/include/sl_thd.h b/src/components/include/sl_thd.h index beadf4be74..67c1d97ff1 100644 --- a/src/components/include/sl_thd.h +++ b/src/components/include/sl_thd.h @@ -33,11 +33,6 @@ struct event_info { tcap_time_t timeout; }; -struct sl_dcb_info { - unsigned long ip; - unsigned long sp; -} __attribute__((__packed__)); - struct sl_thd { sl_thd_state_t state; /* @@ -101,20 +96,20 @@ struct sl_thd { struct event_info event_info; struct ps_list SL_THD_EVENT_LIST; /* list of events for the scheduler end-point */ - struct sl_dcb_info dcb; + struct cos_dcb_info *dcb; }; -static inline struct sl_dcb_info * +static inline struct cos_dcb_info * sl_thd_dcbinfo(struct sl_thd *t) -{ return &t->dcb; } +{ return t->dcb; } static inline unsigned long * sl_thd_ip(struct sl_thd *t) -{ return &t->dcb.ip; } +{ return &t->dcb->ip; } static inline unsigned long * sl_thd_sp(struct sl_thd *t) -{ return &t->dcb.sp; } +{ return &t->dcb->sp; } static inline struct cos_aep_info * sl_thd_aepinfo(struct sl_thd *t) diff --git a/src/components/include/sl_xcpu.h b/src/components/include/sl_xcpu.h index 65f5126592..21d388eb80 100644 --- a/src/components/include/sl_xcpu.h +++ b/src/components/include/sl_xcpu.h @@ -64,6 +64,7 @@ struct sl_global { struct sl_xcpu_request xcpu_rbuf[NUM_CPU][SL_XCPU_RING_SIZE]; u32_t cpu_bmp[(NUM_CPU + 7)/8]; /* bitmap of cpus this scheduler is running on! 
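
/*
 * Conceptual (non-asm) sketch of the bookkeeping the user-level dispatch path
 * above relies on: the switched-out thread's resume point lives in its DCB, and
 * the scheduler publishes the next thread in the per-CPU SCB so the kernel's
 * COMP_GET_SCB_CURTHD view stays consistent.  This is illustrative only; the
 * real fast path is the inline assembly in sl_thd_dispatch(), which fills in
 * the actual ip/sp values.
 */
static inline void
dispatch_bookkeeping(struct sl_thd *next, struct sl_thd *curr)
{
	struct cos_scb_info *scb  = sl_scb_info_cpu();
	struct cos_dcb_info *cdcb = sl_thd_dcbinfo(curr);

	/* placeholder stores; the assembly writes the "2:" resume label and the
	 * live stack pointer here on switch-out */
	cdcb->ip = 0;
	cdcb->sp = 0;

	/* publish the user-level notion of the current thread */
	scb->curr_thd = sl_thd_thdcap(next);
}
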
*/ asndcap_t xcpu_asnd[NUM_CPU][NUM_CPU]; + struct cos_scb_info *scb_area; } CACHE_ALIGNED; extern struct sl_global sl_global_data; diff --git a/src/components/interface/capmgr/memmgr.h b/src/components/interface/capmgr/memmgr.h index b4125336b2..36b4c70de1 100644 --- a/src/components/interface/capmgr/memmgr.h +++ b/src/components/interface/capmgr/memmgr.h @@ -11,4 +11,7 @@ cbuf_t memmgr_shared_page_alloc(vaddr_t *pgaddr); cbuf_t memmgr_shared_page_allocn(unsigned long num_pages, vaddr_t *pgaddr); unsigned long memmgr_shared_page_map(cbuf_t id, vaddr_t *pgaddr); +vaddr_t memmgr_initdcbpage_retrieve(void); +vaddr_t memmgr_dcbpage_allocn(unsigned long num_pages); + #endif /* MEMMGR_H */ diff --git a/src/components/interface/capmgr/stubs/s_stub.S b/src/components/interface/capmgr/stubs/s_stub.S index 57ff3d510a..fe4922d164 100644 --- a/src/components/interface/capmgr/stubs/s_stub.S +++ b/src/components/interface/capmgr/stubs/s_stub.S @@ -14,5 +14,7 @@ cos_asm_server_stub(capmgr_asnd_rcv_create) cos_asm_server_stub(capmgr_asnd_key_create) cos_asm_server_stub(memmgr_heap_page_allocn) +cos_asm_server_stub(memmgr_initdcbpage_retrieve) +cos_asm_server_stub(memmgr_dcbpage_allocn) cos_asm_server_stub_rets(memmgr_shared_page_allocn_cserialized) cos_asm_server_stub_rets(memmgr_shared_page_map_cserialized) diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index ee67d9b202..db74ae9470 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o +LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcbraw.o cos_dcbcapmgr.o LIBS=$(LIB_OBJS:%.o=%.a) MANDITORY=c_stub.o cos_asm_upcall.o cos_asm_ainv.o cos_component.o MAND=$(MANDITORY_LIB) diff --git a/src/components/lib/cos_component.c b/src/components/lib/cos_component.c index fcdb4af884..e4df0404a7 100644 --- a/src/components/lib/cos_component.c +++ b/src/components/lib/cos_component.c @@ -191,13 +191,17 @@ cos_upcall_fn(upcall_type_t t, void *arg1, void *arg2, void *arg3) cos_thd_entry_exec(idx); } } - return; + break; } default: /* fault! */ assert(0); return; } + + /* FIXME: for now, don't let threads page-fault on return! 
*/ + while (1) ; + return; } diff --git a/src/components/lib/cos_dcbcapmgr.c b/src/components/lib/cos_dcbcapmgr.c new file mode 100644 index 0000000000..74db909b35 --- /dev/null +++ b/src/components/lib/cos_dcbcapmgr.c @@ -0,0 +1,45 @@ +#include +#include +#include <../interface/capmgr/memmgr.h> + +static unsigned long free_off[NUM_CPU] CACHE_ALIGNED = { 0 }; +static struct cos_dcb_info *dcb_off[NUM_CPU] CACHE_ALIGNED = { NULL }, *initdcb[NUM_CPU] CACHE_ALIGNED = { NULL }; + +void +cos_dcb_info_init(void) +{ + dcb_off[cos_cpuid()] = initdcb[cos_cpuid()] = (struct cos_dcb_info *)memmgr_initdcbpage_retrieve(); + assert(initdcb[cos_cpuid()]); + + dcb_off[cos_cpuid()]++; + free_off[cos_cpuid()] = 1; +} + +void +cos_dcb_info_alloc(void) +{ + dcb_off[cos_cpuid()] = (struct cos_dcb_info *)memmgr_dcbpage_allocn(1); + assert(dcb_off[cos_cpuid()]); + + free_off[cos_cpuid()] = 0; +} + +struct cos_dcb_info * +cos_dcb_info_assign(void) +{ + unsigned long curr_off = 0; + + curr_off = ps_faa(&free_off[cos_cpuid()], 1); + if (curr_off >= COS_DCB_PERPG_MAX) { + cos_dcb_info_alloc(); + curr_off = ps_faa(&free_off[cos_cpuid()], 1); + } + + return (dcb_off[cos_cpuid()] + curr_off); +} + +struct cos_dcb_info * +cos_dcb_info_init_get(void) +{ + return initdcb[cos_cpuid()]; +} diff --git a/src/components/lib/cos_dcbraw.c b/src/components/lib/cos_dcbraw.c new file mode 100644 index 0000000000..72f08f9294 --- /dev/null +++ b/src/components/lib/cos_dcbraw.c @@ -0,0 +1,47 @@ +#include +#include +#include + +static unsigned long free_off[NUM_CPU] CACHE_ALIGNED = { 0 }; +static struct cos_dcb_info *dcb_off[NUM_CPU] CACHE_ALIGNED = { NULL }, *initdcb[NUM_CPU] CACHE_ALIGNED = { NULL }; + +void +cos_dcb_info_init(void) +{ + dcb_off[cos_cpuid()] = initdcb[cos_cpuid()] = cos_init_dcb_get(); + assert(initdcb[cos_cpuid()]); + + dcb_off[cos_cpuid()]++; + free_off[cos_cpuid()] = 1; +} + +void +cos_dcb_info_alloc(void) +{ + struct cos_compinfo *ci_res = cos_compinfo_get(cos_defcompinfo_curr_get()); + + dcb_off[cos_cpuid()] = cos_dcbpg_bump_allocn(ci_res, PAGE_SIZE); + assert(dcb_off[cos_cpuid()]); + + free_off[cos_cpuid()] = 0; +} + +struct cos_dcb_info * +cos_dcb_info_assign(void) +{ + unsigned long curr_off = 0; + + curr_off = ps_faa(&free_off[cos_cpuid()], 1); + if (curr_off >= COS_DCB_PERPG_MAX) { + cos_dcb_info_alloc(); + curr_off = ps_faa(&free_off[cos_cpuid()], 1); + } + + return (dcb_off[cos_cpuid()] + curr_off); +} + +struct cos_dcb_info * +cos_dcb_info_init_get(void) +{ + return initdcb[cos_cpuid()]; +} diff --git a/src/components/lib/cos_defkernel_api.c b/src/components/lib/cos_defkernel_api.c index 68caf64dc1..6ff0beb15f 100644 --- a/src/components/lib/cos_defkernel_api.c +++ b/src/components/lib/cos_defkernel_api.c @@ -87,7 +87,7 @@ cos_defcompinfo_sched_init(void) } static int -cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, cos_aepthd_fn_t fn, void *data, thdclosure_index_t idx) +cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, cos_aepthd_fn_t fn, void *data, thdclosure_index_t idx, vaddr_t dcbuaddr) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); @@ -97,9 +97,9 @@ cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, assert(curr_defci_init_status == INITIALIZED); memset(aep, 0, sizeof(struct cos_aep_info)); - if (is_init) aep->thd = cos_initthd_alloc(ci, dst_ci->comp_cap); - else if 
(idx > 0) aep->thd = cos_thd_alloc_ext(ci, dst_ci->comp_cap, idx); - else aep->thd = cos_thd_alloc(ci, dst_ci->comp_cap, cos_aepthd_fn, (void *)aep); + if (is_init) aep->thd = cos_initthd_alloc(ci, dst_ci->comp_cap, dst_ci->pgtbl_cap, dcbuaddr); + else if (idx > 0) aep->thd = cos_thd_alloc_ext(ci, dst_ci->comp_cap, idx, dst_ci->pgtbl_cap, dcbuaddr); + else aep->thd = cos_thd_alloc(ci, dst_ci->comp_cap, cos_aepthd_fn, (void *)aep, dst_ci->pgtbl_cap, dcbuaddr); assert(aep->thd); aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!sched && is_init) return 0; @@ -121,7 +121,7 @@ cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, int cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, vaddr_t heap_ptr, capid_t cap_frontier, - int is_sched) + int is_sched, vaddr_t *dcbuaddr, vaddr_t *scbaddr) { int ret; struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); @@ -131,9 +131,11 @@ cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, struct cos_aep_info *child_aep = cos_sched_aep_get(child_defci); assert(curr_defci_init_status == INITIALIZED); - ret = cos_compinfo_alloc(child_ci, heap_ptr, cap_frontier, entry, ci); + ret = cos_compinfo_alloc(child_ci, heap_ptr, cap_frontier, entry, ci, scbaddr); if (ret) return ret; - ret = cos_aep_alloc_intern(child_aep, child_defci, 0, is_sched ? sched_aep : NULL, NULL, NULL, 0); + *dcbuaddr = (vaddr_t)cos_dcbpg_bump_allocn(child_ci, PAGE_SIZE); + assert(*dcbuaddr); + ret = cos_aep_alloc_intern(child_aep, child_defci, 0, is_sched ? sched_aep : NULL, NULL, NULL, 0, *dcbuaddr); return ret; } @@ -147,29 +149,29 @@ cos_defcompinfo_childid_init(struct cos_defcompinfo *child_defci, spdid_t c) } int -cos_initaep_alloc(struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, int is_sched) +cos_initaep_alloc(struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, int is_sched, vaddr_t dcbuaddr) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); struct cos_aep_info *child_aep = cos_sched_aep_get(dst_dci); struct cos_aep_info *sched_use = is_sched ? (sched ? sched : sched_aep) : NULL; - return cos_aep_alloc_intern(child_aep, dst_dci, 0, sched_use, NULL, NULL, 0); + return cos_aep_alloc_intern(child_aep, dst_dci, 0, sched_use, NULL, NULL, 0, dcbuaddr); } int -cos_initaep_tcap_alloc(struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched) +cos_initaep_tcap_alloc(struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, vaddr_t dcbuaddr) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); struct cos_aep_info *child_aep = cos_sched_aep_get(dst_dci); struct cos_aep_info *sched_use = sched ? 
sched : sched_aep; - return cos_aep_alloc_intern(child_aep, dst_dci, tc, sched_use, NULL, NULL, 0); + return cos_aep_alloc_intern(child_aep, dst_dci, tc, sched_use, NULL, NULL, 0, dcbuaddr); } int -cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, thdclosure_index_t idx) +cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, thdclosure_index_t idx, vaddr_t dcbuaddr) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); @@ -178,11 +180,11 @@ cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, str if (!sched) sched_aep = cos_sched_aep_get(dst_dci); else sched_aep = sched; - return cos_aep_alloc_intern(aep, dst_dci, 0, sched_aep, NULL, NULL, idx); + return cos_aep_alloc_intern(aep, dst_dci, 0, sched_aep, NULL, NULL, idx, dcbuaddr); } int -cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, tcap_t tc, thdclosure_index_t idx) +cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, tcap_t tc, thdclosure_index_t idx, vaddr_t dcbuaddr) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); @@ -192,25 +194,25 @@ cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci if (!sched) sched_aep = cos_sched_aep_get(dst_dci); else sched_aep = sched; - return cos_aep_alloc_intern(aep, dst_dci, tc, sched_aep, NULL, NULL, idx); + return cos_aep_alloc_intern(aep, dst_dci, tc, sched_aep, NULL, NULL, idx, dcbuaddr); } int -cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data) +cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, vaddr_t dcbuaddr) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); - return cos_aep_alloc_intern(aep, defci, 0, sched_aep, fn, data, 0); + return cos_aep_alloc_intern(aep, defci, 0, sched_aep, fn, data, 0, dcbuaddr); } int -cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data) +cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data, vaddr_t dcbuaddr) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); - return cos_aep_alloc_intern(aep, defci, tc, sched_aep, fn, data, 0); + return cos_aep_alloc_intern(aep, defci, tc, sched_aep, fn, data, 0, dcbuaddr); } int diff --git a/src/components/lib/cos_kernel_api.c b/src/components/lib/cos_kernel_api.c index e1f9f8994c..5d25f85219 100644 --- a/src/components/lib/cos_kernel_api.c +++ b/src/components/lib/cos_kernel_api.c @@ -507,7 +507,7 @@ __page_bump_valloc(struct cos_compinfo *ci, size_t sz) } static vaddr_t -__page_bump_alloc(struct cos_compinfo *ci, size_t sz) +__page_bump_alloc(struct cos_compinfo *ci, size_t sz, int shared) { struct cos_compinfo *meta = __compinfo_metacap(ci); vaddr_t heap_vaddr, heap_cursor, heap_limit; @@ -532,7 +532,7 @@ __page_bump_alloc(struct cos_compinfo *ci, size_t sz) for (heap_cursor = heap_vaddr; heap_cursor < heap_limit; heap_cursor += PAGE_SIZE) { vaddr_t umem; - umem = __umem_bump_alloc(ci); + umem = shared ? __kmem_bump_alloc(ci) : __umem_bump_alloc(ci); if (!umem) return 0; /* Actually map in the memory. 
*/ @@ -574,7 +574,7 @@ __alloc_mem_cap(struct cos_compinfo *ci, cap_t ct, vaddr_t *kmem, capid_t *cap) } static thdcap_t -__cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init_data) +__cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init_data, pgtblcap_t ptcap, vaddr_t dcbaddr) { vaddr_t kmem; capid_t cap; @@ -584,10 +584,11 @@ __cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init assert(ci && comp > 0); if (__alloc_mem_cap(ci, CAP_THD, &kmem, &cap)) return 0; - assert(!(init_data & ~((1 << 16) - 1))); - /* TODO: Add cap size checking */ - if (call_cap_op(ci->captbl_cap, CAPTBL_OP_THDACTIVATE, (init_data << 16) | cap, - __compinfo_metacap(ci)->mi.pgtbl_cap, kmem, comp)) + assert(!(init_data & ~((1 << 12) - 1))); + assert(kmem && (round_to_page(kmem) == kmem)); + + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_THDACTIVATE, __compinfo_metacap(ci)->mi.pgtbl_cap | (cap << 16), + kmem | init_data, comp << 16 | ptcap, dcbaddr)) BUG(); return cap; @@ -596,30 +597,30 @@ __cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init #include thdcap_t -cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx) +cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx, pgtblcap_t ptcap, vaddr_t dcbaddr) { if (idx < 1) return 0; - return __cos_thd_alloc(ci, comp, idx); + return __cos_thd_alloc(ci, comp, idx, ptcap, dcbaddr); } thdcap_t -cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data) +cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, pgtblcap_t ptcap, vaddr_t dcbaddr) { int idx = cos_thd_init_alloc(fn, data); thdcap_t ret; if (idx < 1) return 0; - ret = __cos_thd_alloc(ci, comp, idx); + ret = __cos_thd_alloc(ci, comp, idx, ptcap, dcbaddr); if (!ret) cos_thd_init_free(idx); return ret; } thdcap_t -cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp) +cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, pgtblcap_t ptcap, vaddr_t dcbaddr) { - return __cos_thd_alloc(ci, comp, 0); + return __cos_thd_alloc(ci, comp, 0, ptcap, dcbaddr); } captblcap_t @@ -657,25 +658,25 @@ cos_pgtbl_alloc(struct cos_compinfo *ci) } compcap_t -cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, vaddr_t entry) +cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, vaddr_t entry, vaddr_t scbpg) { capid_t cap; u32_t lid = livenessid_bump_alloc(); printd("cos_comp_alloc\n"); - assert(ci && ctc && ptc && lid); + assert(ci && ctc && ptc && lid && scbpg); cap = __capid_bump_alloc(ci, CAP_COMP); if (!cap) return 0; - if (call_cap_op(ci->captbl_cap, CAPTBL_OP_COMPACTIVATE, cap, (ctc << 16) | ptc, lid, entry)) BUG(); + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_COMPACTIVATE, (lid << 16) | cap, (ctc << 16) | ptc, scbpg, entry)) BUG(); return cap; } int cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, - struct cos_compinfo *ci_resources) + struct cos_compinfo *ci_resources, vaddr_t *scbpg) { pgtblcap_t ptc; captblcap_t ctc; @@ -687,10 +688,13 @@ cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_fronti assert(ptc); ctc = cos_captbl_alloc(ci_resources); assert(ctc); - compc = cos_comp_alloc(ci_resources, ctc, ptc, entry); - assert(compc); + cos_compinfo_init(ci, ptc, ctc, 0, heap_ptr, cap_frontier, ci_resources); + *scbpg = (vaddr_t)cos_scbpg_bump_allocn(ci, COS_SCB_SIZE); + assert(*scbpg); - 
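
/*
 * Illustration of the argument packing used by the THDACTIVATE invocation in
 * __cos_thd_alloc() above: kmem is page-aligned, so its low 12 bits are free to
 * carry the closure index, and pairs of 16-bit capability ids share one
 * register-sized word.  The masks mirror the asserts in __cos_thd_alloc(); how
 * the kernel unpacks them is an assumption, and this helper exists only for
 * exposition.
 */
static inline void
thdactivate_pack_example(capid_t cap, pgtblcap_t meta_pt, vaddr_t kmem,
			 thdclosure_index_t init_data, compcap_t comp,
			 pgtblcap_t ptcap, vaddr_t dcbaddr, word_t out[4])
{
	assert((kmem & (PAGE_SIZE - 1)) == 0);       /* round_to_page(kmem) == kmem */
	assert((init_data & ~((1 << 12) - 1)) == 0); /* closure index fits in 12 bits */

	out[0] = meta_pt | ((word_t)cap << 16);      /* kernel-memory pgtbl cap + new thread cap */
	out[1] = kmem | (word_t)init_data;           /* page-aligned kmem carries the closure index */
	out[2] = ((word_t)comp << 16) | ptcap;       /* component cap + pgtbl that maps the DCB */
	out[3] = dcbaddr;                            /* DCB virtual address, passed unmodified */
}
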
cos_compinfo_init(ci, ptc, ctc, compc, heap_ptr, cap_frontier, ci_resources); + compc = cos_comp_alloc(ci_resources, ctc, ptc, entry, *scbpg); + assert(compc); + ci->comp_cap = compc; return 0; } @@ -779,10 +783,29 @@ cos_hw_alloc(struct cos_compinfo *ci, u32_t bitmap) return cap; } +/* TODO: Can we alias/etc on this page with this logic? */ +void * +cos_dcbpg_bump_allocn(struct cos_compinfo *ci, size_t sz) +{ + assert(sz == PAGE_SIZE); + /* assert(sz % PAGE_SIZE == 0); */ + + return (void *)__page_bump_alloc(ci, sz, 1); +} + +void * +cos_scbpg_bump_allocn(struct cos_compinfo *ci, size_t sz) +{ + assert(sz == PAGE_SIZE); + /* assert(sz % PAGE_SIZE == 0); */ + + return (void *)__page_bump_alloc(ci, sz, 1); +} + void * cos_page_bump_alloc(struct cos_compinfo *ci) { - return (void *)__page_bump_alloc(ci, PAGE_SIZE); + return (void *)__page_bump_alloc(ci, PAGE_SIZE, 0); } void * @@ -790,7 +813,7 @@ cos_page_bump_allocn(struct cos_compinfo *ci, size_t sz) { assert(sz % PAGE_SIZE == 0); - return (void *)__page_bump_alloc(ci, sz); + return (void *)__page_bump_alloc(ci, sz, 0); } capid_t diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index cc311ae9ac..1d06543569 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -14,6 +14,7 @@ #include "../../interface/capmgr/memmgr.h" #include #include +#include extern void sl_thd_event_info_reset(struct sl_thd *t); extern void sl_thd_free_no_cs(struct sl_thd *t); @@ -55,7 +56,7 @@ sl_xcpu_asnd_alloc(void) } struct sl_thd * -sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps) +sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps, struct cos_dcb_info *dcb) { struct sl_thd_policy *tp = NULL; struct sl_thd *t = NULL; @@ -64,6 +65,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t if (!tp) goto done; t = sl_mod_thd_get(tp); + t->dcb = dcb; t->properties = prps; t->aepinfo = aep; t->sndcap = sndcap; @@ -88,20 +90,23 @@ struct sl_thd * sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = &dci->ci; + struct cos_compinfo *ci = cos_compinfo_get(dci); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; thdcap_t thdcap = 0; thdid_t tid = 0; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; + dcb = cos_dcb_info_assign(); + /* TODO: use dcb */ aep->thd = capmgr_thd_create(fn, data, &tid); if (!aep->thd) goto done; aep->tid = tid; - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); done: @@ -128,7 +133,7 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn assert(snd); } - t = sl_thd_alloc_init(aep, snd, prps); + t = sl_thd_alloc_init(aep, snd, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); done: @@ -136,7 +141,7 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn } static struct sl_thd * -sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) +sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vaddr_t dcbuaddr) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); @@ -154,7 +159,7 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) if (!aep->thd) goto done; aep->tc = 
sl_thd_tcap(sl__globals_cpu()->sched_thd); - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); } else { struct cos_aep_info *compaep = cos_sched_aep_get(comp); @@ -173,7 +178,7 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) } static struct sl_thd * -sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, vaddr_t dcbuaddr, arcvcap_t *extrcv) { struct cos_aep_info *aep = NULL; struct sl_thd *t = NULL; @@ -201,7 +206,7 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t capmgr_aep_create_ext(comp->id, aep, idx, owntc, key, extrcv); if (!aep->thd) goto done; - t = sl_thd_alloc_init(aep, 0, prps); + t = sl_thd_alloc_init(aep, 0, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); } @@ -214,16 +219,18 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c { struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; int owntc = 0; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; + dcb = cos_dcb_info_assign(); if (prps & SL_THD_PROPERTY_OWN_TCAP) owntc = 1; capmgr_aep_create(aep, fn, data, owntc, key); if (aep->thd == 0) goto done; - t = sl_thd_alloc_init(aep, 0, prps); + t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); done: @@ -270,7 +277,7 @@ sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched) } struct sl_thd * -sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key) +sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr) { struct sl_thd *t = NULL; @@ -278,10 +285,10 @@ sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int sl_cs_enter(); if (!is_sched) { - t = sl_thd_alloc_ext_no_cs(comp, 0); + t = sl_thd_alloc_ext_no_cs(comp, 0, dcbuaddr); } else { t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, 0, (is_sched ? SL_THD_PROPERTY_SEND : 0) - | (own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0), key, NULL); + | (own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0), key, dcbuaddr, NULL); } sl_cs_exit(); @@ -289,7 +296,7 @@ sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int } struct sl_thd * -sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr, arcvcap_t *extrcv) { struct sl_thd *t = NULL; @@ -299,9 +306,9 @@ sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thd sl_cs_enter(); if (!is_aep) own_tcap = 0; if (is_aep) { - t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0, key, extrcv); + t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? 
SL_THD_PROPERTY_OWN_TCAP : 0, key, dcbuaddr, extrcv); } else { - t = sl_thd_alloc_ext_no_cs(comp, idx); + t = sl_thd_alloc_ext_no_cs(comp, idx, dcbuaddr); } sl_cs_exit(); @@ -318,7 +325,7 @@ sl_thd_init_ext_no_cs(struct cos_aep_info *aepthd, struct sl_thd *sched) if (!aep) goto done; *aep = *aepthd; - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, NULL); if (!t) goto done; /* use sched info for parent -> child notifications */ diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index 3c53cd93ad..378a9c7969 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -11,6 +11,7 @@ #include #include #include +#include extern void sl_thd_event_info_reset(struct sl_thd *t); extern void sl_thd_free_no_cs(struct sl_thd *t); @@ -47,7 +48,7 @@ sl_xcpu_asnd_alloc(void) } struct sl_thd * -sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps) +sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps, struct cos_dcb_info *dcb) { struct sl_thd_policy *tp = NULL; struct sl_thd *t = NULL; @@ -57,6 +58,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t if (!tp) goto done; t = sl_mod_thd_get(tp); + t->dcb = dcb; t->properties = prps; t->aepinfo = aep; t->sndcap = sndcap; @@ -84,16 +86,18 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) struct cos_compinfo *ci = cos_compinfo_get(dci); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; + dcb = cos_dcb_info_assign(); - aep->thd = cos_thd_alloc(ci, ci->comp_cap, fn, data); + aep->thd = cos_thd_alloc(ci, ci->comp_cap, fn, data, ci->pgtbl_cap, (vaddr_t)dcb); if (!aep->thd) goto done; aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!aep->tid) goto done; - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); done: @@ -119,7 +123,7 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn assert(snd); } - t = sl_thd_alloc_init(aep, snd, prps); + t = sl_thd_alloc_init(aep, snd, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); done: @@ -127,7 +131,7 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn } static struct sl_thd * -sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) +sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vaddr_t dcbuaddr) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); @@ -140,16 +144,16 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx) aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - aep->thd = cos_thd_alloc_ext(ci, compci->comp_cap, idx); + aep->thd = cos_thd_alloc_ext(ci, compci->comp_cap, idx, compci->pgtbl_cap, dcbuaddr); if (!aep->thd) goto done; aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!aep->tid) goto done; - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); } else { assert(idx == 0); - ret = cos_initaep_alloc(comp, NULL, 0); + ret = cos_initaep_alloc(comp, NULL, 0, dcbuaddr); if (ret) goto done; t = sl_thd_comp_init_no_cs(comp, 0, 0); @@ -165,18 +169,20 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct 
sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; int ret; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; + dcb = cos_dcb_info_assign(); /* NOTE: Cannot use stack-allocated cos_aep_info struct here */ - if (prps & SL_THD_PROPERTY_OWN_TCAP) ret = cos_aep_alloc(aep, fn, data); + if (prps & SL_THD_PROPERTY_OWN_TCAP) ret = cos_aep_alloc(aep, fn, data, (vaddr_t)dcb); else ret = cos_aep_tcap_alloc(aep, sl_thd_aepinfo(sl__globals_cpu()->sched_thd)->tc, - fn, data); + fn, data, (vaddr_t)dcb); if (ret) goto done; - t = sl_thd_alloc_init(aep, 0, prps); + t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); done: @@ -184,7 +190,7 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c } static struct sl_thd * -sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, vaddr_t dcbuaddr, arcvcap_t *extrcv) { struct cos_aep_info *aep = NULL; struct sl_thd *t = NULL; @@ -194,9 +200,9 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t if (prps & SL_THD_PROPERTY_SEND) { assert(sched); if (prps & SL_THD_PROPERTY_OWN_TCAP) { - ret = cos_initaep_alloc(comp, sl_thd_aepinfo(sched), prps & SL_THD_PROPERTY_SEND); + ret = cos_initaep_alloc(comp, sl_thd_aepinfo(sched), prps & SL_THD_PROPERTY_SEND, dcbuaddr); } else { - ret = cos_initaep_tcap_alloc(comp, sl_thd_tcap(sched), sl_thd_aepinfo(sched)); + ret = cos_initaep_tcap_alloc(comp, sl_thd_tcap(sched), sl_thd_aepinfo(sched), dcbuaddr); } if (ret) goto done; @@ -208,13 +214,13 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t if (!aep) goto done; if (prps & SL_THD_PROPERTY_OWN_TCAP) { - ret = cos_aep_alloc_ext(aep, comp, sl_thd_aepinfo(sched), idx); + ret = cos_aep_alloc_ext(aep, comp, sl_thd_aepinfo(sched), idx, dcbuaddr); } else { - ret = cos_aep_tcap_alloc_ext(aep, comp, sl_thd_aepinfo(sched), sl_thd_tcap(sched), idx); + ret = cos_aep_tcap_alloc_ext(aep, comp, sl_thd_aepinfo(sched), sl_thd_tcap(sched), idx, dcbuaddr); } if (ret) goto done; - t = sl_thd_alloc_init(aep, 0, prps); + t = sl_thd_alloc_init(aep, 0, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); if (extrcv) *extrcv = sl_thd_rcvcap(t); @@ -264,23 +270,23 @@ sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched) } struct sl_thd * -sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key) +sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr) { struct sl_thd *t = NULL; if (!comp) return NULL; sl_cs_enter(); - if (!is_sched) t = sl_thd_alloc_ext_no_cs(comp, 0); + if (!is_sched) t = sl_thd_alloc_ext_no_cs(comp, 0, dcbuaddr); else t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, 0, (is_sched ? SL_THD_PROPERTY_SEND : 0) - | (own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0), key, NULL); + | (own_tcap ? 
SL_THD_PROPERTY_OWN_TCAP : 0), key, dcbuaddr, NULL); sl_cs_exit(); return t; } struct sl_thd * -sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr, arcvcap_t *extrcv) { struct sl_thd *t = NULL; @@ -288,9 +294,9 @@ sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thd sl_cs_enter(); if (!is_aep) own_tcap = 0; if (is_aep) { - t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0, key, extrcv); + t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0, key, dcbuaddr, extrcv); } else { - t = sl_thd_alloc_ext_no_cs(comp, idx); + t = sl_thd_alloc_ext_no_cs(comp, idx, dcbuaddr); } sl_cs_exit(); @@ -311,7 +317,7 @@ sl_thd_init_ext(struct cos_aep_info *aepthd, struct sl_thd *sched) *aep = *aepthd; /* TODO: use sched info for parent -> child notifications */ - t = sl_thd_alloc_init(aep, 0, 0); + t = sl_thd_alloc_init(aep, 0, 0, NULL); done: sl_cs_exit(); diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 8cbdf5c5c3..30a15b52f1 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -13,11 +13,12 @@ #include #include #include +#include struct sl_global sl_global_data; struct sl_global_cpu sl_global_cpu_data[NUM_CPU] CACHE_ALIGNED; static void sl_sched_loop_intern(int non_block) __attribute__((noreturn)); -extern struct sl_thd *sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps); +extern struct sl_thd *sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps, struct cos_dcb_info *dcb); extern int sl_xcpu_process_no_cs(void); extern void sl_xcpu_asnd_alloc(void); @@ -551,6 +552,7 @@ sl_global_init(u32_t *cpu_bmp) unsigned int i = 0; memset(g, 0, sizeof(struct sl_global)); + assert(sizeof(struct cos_scb_info) * NUM_CPU <= COS_SCB_SIZE && COS_SCB_SIZE == PAGE_SIZE); for (i = 0; i < NUM_CPU; i++) { if (!bitmap_check(cpu_bmp, i)) continue; @@ -558,6 +560,7 @@ sl_global_init(u32_t *cpu_bmp) bitmap_set(g->cpu_bmp, i); ck_ring_init(sl__ring(i), SL_XCPU_RING_SIZE); } + g->scb_area = (struct cos_scb_info *)cos_scb_info_get(); } void @@ -583,15 +586,17 @@ sl_init(microsec_t period) assert(sizeof(struct sl_cs) <= sizeof(unsigned long)); memset(g, 0, sizeof(struct sl_global_cpu)); - g->cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - g->lock.u.v = 0; + g->cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + g->lock.u.v = 0; + g->scb_info = ((sl__globals()->scb_area) + cos_cpuid()); sl_thd_init_backend(); sl_mod_init(); sl_timeout_init(period); /* Create the scheduler thread for us. 
cos_sched_aep_get() is from global(static) memory */ - g->sched_thd = sl_thd_alloc_init(cos_sched_aep_get(dci), 0, 0); + cos_dcb_info_init(); + g->sched_thd = sl_thd_alloc_init(cos_sched_aep_get(dci), 0, 0, (struct cos_dcb_info *)cos_init_dcb_get()); assert(g->sched_thd); g->sched_thdcap = BOOT_CAPTBL_SELF_INITTHD_CPU_BASE; g->sched_tcap = BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE; diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index a22edc37d6..f4ea791bd0 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -932,6 +932,8 @@ cap_introspect(struct captbl *ct, capid_t capid, u32_t op, unsigned long *retval return thd_introspect(((struct cap_thd *)ch)->t, op, retval); case CAP_TCAP: return tcap_introspect(((struct cap_tcap *)ch)->tcap, op, retval); + case CAP_COMP: + return comp_introspect(((struct cap_comp *)ch), op, retval); default: return -EINVAL; } @@ -1184,22 +1186,33 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * break; } case CAPTBL_OP_THDACTIVATE: { - thdclosure_index_t init_data = __userregs_get1(regs) >> 16; - capid_t thd_cap = __userregs_get1(regs) & 0xFFFF; - capid_t pgtbl_cap = __userregs_get2(regs); - capid_t pgtbl_addr = __userregs_get3(regs); - capid_t compcap = __userregs_get4(regs); - - struct thread *thd; - unsigned long *pte = NULL; - - ret = cap_kmem_activate(ct, pgtbl_cap, pgtbl_addr, (unsigned long *)&thd, &pte); + u32_t reg2 = __userregs_get2(regs); + u32_t reg3 = __userregs_get3(regs); + u32_t reg4 = __userregs_get4(regs); + thdclosure_index_t init_data = (reg2) & (~(~0 << 12)); + capid_t pgtbl_addr = (reg2) & (~0 << 12); + capid_t thd_cap = (capin >> 16); + capid_t pgtbl_cap = (capin << 16) >> 16; + capid_t compcap = (reg3 >> 16); + capid_t dcbpgtbl_cap = (reg3 << 16) >> 16; + vaddr_t dcbuaddr = reg4, dcbkaddr; + unsigned long *tpte = NULL, *dcbpte = NULL, flags; + struct thread *thd; + struct cap_header *ctfrom; + + ret = cap_kmem_activate(ct, pgtbl_cap, pgtbl_addr, (unsigned long *)&thd, &tpte); if (unlikely(ret)) cos_throw(err, ret); - assert(thd && pte); + assert(thd && tpte); + + ctfrom = captbl_lkup(ct, dcbpgtbl_cap); + if (unlikely(!ctfrom || ctfrom->type != CAP_PGTBL)) return -EINVAL; + dcbpte = pgtbl_lkup(((struct cap_pgtbl *)ctfrom)->pgtbl, (dcbuaddr & (~0 << 12)), (u32_t *)&flags); + if (!dcbpte) return -EINVAL; + dcbkaddr = ((unsigned long)dcbpte & (~0 << 12)) | (dcbuaddr & ~(~0 << 12)); /* ret is returned by the overall function */ - ret = thd_activate(ct, cap, thd_cap, thd, compcap, init_data); - if (ret) kmem_unalloc(pte); + ret = thd_activate(ct, cap, thd_cap, thd, compcap, init_data, dcbkaddr); + if (ret) kmem_unalloc(tpte); break; } @@ -1249,10 +1262,12 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * case CAPTBL_OP_COMPACTIVATE: { capid_t captbl_cap = __userregs_get2(regs) >> 16; capid_t pgtbl_cap = __userregs_get2(regs) & 0xFFFF; - livenessid_t lid = __userregs_get3(regs); + livenessid_t lid = (capin >> 16); + capid_t comp_cap = (capin << 16) >> 16; + vaddr_t scb_uaddr = __userregs_get3(regs); vaddr_t entry_addr = __userregs_get4(regs); - ret = comp_activate(ct, cap, capin, captbl_cap, pgtbl_cap, lid, entry_addr, NULL); + ret = comp_activate(ct, cap, comp_cap, captbl_cap, pgtbl_cap, lid, entry_addr, scb_uaddr); break; } case CAPTBL_OP_COMPDEACTIVATE: { diff --git a/src/kernel/include/component.h b/src/kernel/include/component.h index c837cf22fa..0c6c31581b 100644 --- a/src/kernel/include/component.h +++ b/src/kernel/include/component.h @@ -17,7 +17,7 @@ struct 
comp_info { struct liveness_data liveness; pgtbl_t pgtbl; struct captbl * captbl; - struct cos_sched_data_area *comp_nfo; + struct cos_scb_info *scb_data; } __attribute__((packed)); struct cap_comp { @@ -30,13 +30,14 @@ struct cap_comp { static int comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, capid_t pgtbl_cap, livenessid_t lid, - vaddr_t entry_addr, struct cos_sched_data_area *sa) + vaddr_t entry_addr, vaddr_t scb_uaddr) { struct cap_comp * compc; struct cap_pgtbl * ptc; struct cap_captbl *ctc; - u32_t v; + u32_t v, flags; int ret = 0; + vaddr_t scb_kaddr = 0; ctc = (struct cap_captbl *)captbl_lkup(t, captbl_cap); if (unlikely(!ctc || ctc->h.type != CAP_CAPTBL || ctc->lvl > 0)) return -EINVAL; @@ -47,6 +48,9 @@ comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, if (v & CAP_MEM_FROZEN_FLAG) return -EINVAL; if (cos_cas((unsigned long *)&ptc->refcnt_flags, v, v + 1) != CAS_SUCCESS) return -ECASFAIL; + scb_kaddr = (vaddr_t)pgtbl_lkup(((struct cap_pgtbl *)ptc)->pgtbl, scb_uaddr, &flags); + assert(scb_kaddr); + v = ctc->refcnt_flags; if (v & CAP_MEM_FROZEN_FLAG) cos_throw(undo_ptc, -EINVAL); if (cos_cas((unsigned long *)&ctc->refcnt_flags, v, v + 1) != CAS_SUCCESS) { @@ -60,7 +64,8 @@ comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, compc->entry_addr = entry_addr; compc->info.pgtbl = ptc->pgtbl; compc->info.captbl = ctc->captbl; - compc->info.comp_nfo = sa; + compc->info.scb_data = (struct cos_scb_info *)scb_kaddr; + memset(compc->info.scb_data, 0, PAGE_SIZE); compc->pgd = ptc; compc->ct_top = ctc; ltbl_get(lid, &compc->info.liveness); @@ -107,4 +112,17 @@ comp_init(void) assert(sizeof(struct cap_comp) <= __captbl_cap2bytes(CAP_COMP)); } +static inline int +comp_introspect(struct cap_comp *t, unsigned long op, unsigned long *retval) +{ + switch (op) { + case COMP_GET_SCB_CURTHD: + *retval = t->info.scb_data->curr_thd; + break; + default: + return -EINVAL; + } + return 0; +} + #endif /* COMPONENT_H */ diff --git a/src/kernel/include/pgtbl.h b/src/kernel/include/pgtbl.h index 7ef95512d8..bef911d995 100644 --- a/src/kernel/include/pgtbl.h +++ b/src/kernel/include/pgtbl.h @@ -290,6 +290,7 @@ pgtbl_mapping_add(pgtbl_t pt, u32_t addr, u32_t page, u32_t flags) PGTBL_DEPTH, &accum); if (!pte) return -ENOENT; orig_v = (u32_t)(pte->next); +// printk("%p %x\n", pte, orig_v); if (orig_v & PGTBL_PRESENT) return -EEXIST; if (orig_v & PGTBL_COSFRAME) return -EPERM; @@ -357,6 +358,7 @@ pgtbl_cosframe_add(pgtbl_t pt, u32_t addr, u32_t page, u32_t flags) PGTBL_DEPTH, &accum); orig_v = (u32_t)(pte->next); assert(orig_v == 0); +// printk("%x %x %p %x\n", addr, page, pte, orig_v); return __pgtbl_update_leaf(pte, (void *)(page | flags), 0); } diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index 0a67c5f027..b90ad107f3 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -60,6 +60,7 @@ /* Composite user memory uses physical memory above this. */ #define COS_MEM_START COS_MEM_KERN_PA +#define COS_SCB_SIZE (PAGE_SIZE) /* NUM_CPU_SOCKETS defined in cpu_ghz.h. The information is used for * intelligent IPI distribution. 
*/ diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index 1e94df8235..f832d08cdb 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -291,6 +291,8 @@ enum { /* thread id */ THD_GET_TID, + THD_GET_DCB_IP, + THD_GET_DCB_SP, }; enum @@ -299,6 +301,12 @@ enum TCAP_GET_BUDGET, }; +enum +{ + /* get current thread info from scb */ + COMP_GET_SCB_CURTHD, +}; + /* Macro used to define per core variables */ #define PERCPU(type, name) \ PERCPU_DECL(type, name); \ @@ -390,6 +398,16 @@ struct cos_stack_freelists { /* #error "Assembly in requires that COMP_INFO_STACK_FREELISTS != 1 || * COMP_INFO_TMEM_STK_RELINQ != 0. Change the defines, or change the assembly" */ /* #endif */ +struct cos_scb_info { + capid_t curr_thd; + cycles_t timer_next; + sched_tok_t sched_tok; +} CACHE_ALIGNED; + +struct cos_dcb_info { + unsigned long ip; + unsigned long sp; +} __attribute__((packed)); struct cos_component_information { struct cos_stack_freelists cos_stacks; @@ -400,7 +418,7 @@ struct cos_component_information { vaddr_t cos_heap_allocated, cos_heap_alloc_extent; vaddr_t cos_upcall_entry; vaddr_t cos_async_inv_entry; - // struct cos_sched_data_area *cos_sched_data_area; + struct cos_scb_info *cos_scb_data; vaddr_t cos_user_caps; struct restartable_atomic_sequence cos_ras[COS_NUM_ATOMIC_SECTIONS / 2]; vaddr_t cos_poly[COMP_INFO_POLY_NUM]; diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 8c10d536cc..8861188c86 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -69,6 +69,7 @@ struct thread { tcap_time_t timeout; struct thread *interrupted_thread; struct thread *scheduler_thread; + struct cos_dcb_info *dcbinfo; /* rcv end-point data-structures */ struct rcvcap_info rcvcap; @@ -333,7 +334,7 @@ thd_scheduler_set(struct thread *thd, struct thread *sched) } static int -thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, capid_t compcap, thdclosure_index_t init_data) +thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, capid_t compcap, thdclosure_index_t init_data, unsigned long dcbkaddr) { struct cos_cpu_local_info *cli = cos_cpu_local_info(); struct cap_thd *tc; @@ -354,6 +355,8 @@ thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, c thd->refcnt = 1; thd->invstk_top = 0; thd->cpuid = get_cpuid(); + thd->dcbinfo = (struct cos_dcb_info *)dcbkaddr; + memset(thd->dcbinfo, 0, sizeof(struct cos_dcb_info)); assert(thd->tid <= MAX_NUM_THREADS); thd_scheduler_set(thd, thd_current(cli)); @@ -584,6 +587,14 @@ thd_introspect(struct thread *t, unsigned long op, unsigned long *retval) case THD_GET_TID: *retval = t->tid; break; + case THD_GET_DCB_IP: + *retval = t->dcbinfo->ip; + printk("%lx\n", t->dcbinfo->ip); + break; + case THD_GET_DCB_SP: + *retval = t->dcbinfo->sp; + printk("%lx\n", t->dcbinfo->sp); + break; default: return -EINVAL; } diff --git a/src/platform/i386/boot_comp.c b/src/platform/i386/boot_comp.c index 82b363de1f..4c05e2a006 100644 --- a/src/platform/i386/boot_comp.c +++ b/src/platform/i386/boot_comp.c @@ -12,6 +12,7 @@ extern u8_t *boot_comp_pgd; +vaddr_t dcb_addr[NUM_CPU]; void *thd_mem[NUM_CPU], *tcap_mem[NUM_CPU]; struct captbl *glb_boot_ct; @@ -23,7 +24,7 @@ boot_nptes(unsigned int sz) int boot_pgtbl_mappings_add(struct captbl *ct, capid_t pgdcap, capid_t ptecap, const char *label, void *kern_vaddr, - unsigned long user_vaddr, unsigned int range, int uvm) + unsigned long user_vaddr, unsigned int range, 
int uvm, unsigned long *scb_uaddr) { int ret; u8_t * ptes; @@ -34,7 +35,8 @@ boot_pgtbl_mappings_add(struct captbl *ct, capid_t pgdcap, capid_t ptecap, const pgd_cap = (struct cap_pgtbl *)captbl_lkup(ct, pgdcap); if (!pgd_cap || !CAP_TYPECHK(pgd_cap, CAP_PGTBL)) assert(0); pgtbl = (pgtbl_t)pgd_cap->pgtbl; - nptes = boot_nptes(range); + if (!uvm) nptes = boot_nptes(range); + else nptes = boot_nptes(range + COS_SCB_SIZE); ptes = mem_boot_alloc(nptes); assert(ptes); @@ -85,13 +87,42 @@ boot_pgtbl_mappings_add(struct captbl *ct, capid_t pgdcap, capid_t ptecap, const if (!uvm && pgtbl_cosframe_add(pgtbl, mapat, pf, PGTBL_COSFRAME)) assert(0); assert((void *)p == pgtbl_lkup(pgtbl, user_vaddr + i * PAGE_SIZE, &flags)); } + if (uvm) { + unsigned int j; + u8_t *p; + paddr_t pf; + u32_t mapat = (u32_t)user_vaddr + i * PAGE_SIZE, flags = 0; + + assert(i == range / PAGE_SIZE); + assert(COS_SCB_SIZE == PAGE_SIZE); /* FIXME: for prototype impl! */ + p = mem_boot_alloc(1); + assert(p); + pf = chal_va2pa(p); + + if (pgtbl_mapping_add(pgtbl, mapat, pf, PGTBL_USER_DEF)) assert(0); + *scb_uaddr = (unsigned long)mapat; + i++; + + for (j = 0; j < NUM_CPU; j++, i++) { + unsigned long *pte = NULL, flags; + mapat = (u32_t)user_vaddr + i * PAGE_SIZE; + p = mem_boot_alloc(1); + assert(p); + pf = chal_va2pa(p); + if (pgtbl_mapping_add(pgtbl, mapat, pf, PGTBL_USER_DEF)) assert(0); + + dcb_addr[j] = (unsigned long)p; + pte = pgtbl_lkup(pgtbl, mapat, (u32_t *)&flags); + assert((void *)p == pte); + } + } return 0; } /* FIXME: loops to create threads/tcaps/rcv caps per core. */ static void -kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, const cpuid_t cpu_id) +kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, vaddr_t dcb_addr, const cpuid_t cpu_id) { struct cos_cpu_local_info *cos_info = cos_cpu_local_info(); struct thread * t = thd_mem; @@ -108,7 +139,7 @@ kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, const cpuid_t cp cos_info->cpuid = cpu_id; cos_info->invstk_top = 0; cos_info->overflow_check = 0xDEADBEEF; - ret = thd_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), thd_mem, BOOT_CAPTBL_SELF_COMP, 0); + ret = thd_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), thd_mem, BOOT_CAPTBL_SELF_COMP, 0, dcb_addr); assert(!ret); tcap_active_init(cos_info); @@ -152,12 +183,13 @@ kern_boot_comp(const cpuid_t cpu_id) u8_t * boot_comp_captbl; pgtbl_t pgtbl = (pgtbl_t)chal_va2pa(&boot_comp_pgd), boot_vm_pgd; u32_t hw_bitmap = 0xFFFFFFFF; + vaddr_t scb_uaddr = 0; assert(cpu_id >= 0); if (NUM_CPU > 1 && cpu_id > 0) { assert(glb_boot_ct); pgtbl_update(pgtbl); - kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], cpu_id); + kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], dcb_addr[cpu_id], cpu_id); return; } @@ -201,8 +233,8 @@ kern_boot_comp(const cpuid_t cpu_id) assert(0); ret = boot_pgtbl_mappings_add(glb_boot_ct, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_BOOTVM_PTE, "booter VM", mem_bootc_start(), - (unsigned long)mem_bootc_vaddr(), mem_bootc_end() - mem_bootc_start(), 1); - assert(ret == 0); + (unsigned long)mem_bootc_vaddr(), mem_bootc_end() - mem_bootc_start(), 1, &scb_uaddr); + assert(ret == 0 && scb_uaddr); /* * This _must_ be the last allocation. 
The bump pointer @@ -215,7 +247,7 @@ kern_boot_comp(const cpuid_t cpu_id) nkmemptes = boot_nptes(mem_utmem_end() - mem_boot_end()); ret = boot_pgtbl_mappings_add(glb_boot_ct, BOOT_CAPTBL_SELF_UNTYPED_PT, BOOT_CAPTBL_KM_PTE, "untyped memory", mem_boot_nalloc_end(nkmemptes), BOOT_MEM_KM_BASE, - mem_utmem_end() - mem_boot_nalloc_end(nkmemptes), 0); + mem_utmem_end() - mem_boot_nalloc_end(nkmemptes), 0, 0); assert(ret == 0); printk("\tCapability table and page-table created.\n"); @@ -224,11 +256,11 @@ kern_boot_comp(const cpuid_t cpu_id) glb_memlayout.allocs_avail = 0; if (comp_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_PT, 0, - (vaddr_t)mem_bootc_entry(), NULL)) + (vaddr_t)mem_bootc_entry(), scb_uaddr)) assert(0); printk("\tCreated boot component structure from page-table and capability-table.\n"); - kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], cpu_id); + kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], dcb_addr[cpu_id], cpu_id); printk("\tBoot component initialization complete.\n"); } From 5913090feae28dc5e93be63afbb32d8f9bb942c9 Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 22 Feb 2019 15:13:21 -0500 Subject: [PATCH 027/127] Fixed user-level dispatch which also works with timer interrupts * TODO: use dcb capabilities instead of raw dcb pages --- src/components/Makefile.comp | 2 +- .../tests/unit_schedtests/unit_schedlib.c | 23 +----- src/components/include/sl.h | 39 +++++++--- src/components/lib/sl/Makefile | 2 +- src/components/lib/sl/sl_sched.c | 17 ++--- src/components/lib/sl/sl_slowpath.S | 9 --- src/kernel/capinv.c | 73 +++++++++++++------ src/kernel/include/shared/cos_types.h | 2 +- src/kernel/include/thd.h | 28 ++++++- 9 files changed, 122 insertions(+), 73 deletions(-) delete mode 100644 src/components/lib/sl/sl_slowpath.S diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 602bb7a0a1..80251128cc 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -52,6 +52,6 @@ SERVER_STUB=s_stub.o CLIENT_STUB=c_stub.o LIBCOSDEFKERN=-lcos_kernel_api -lcos_defkernel_api -LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcpu -lsl_child -lck -lsl_slowpath +LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcpu -lsl_child -lck LIBSLCAPMGR=$(LIBSLCORE) -lsl_capmgr -lcos_dcbcapmgr LIBSLRAW=$(LIBSLCORE) -lsl_raw -lcos_dcbraw diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index 55343b1728..9b098260c8 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -40,8 +40,6 @@ static volatile int testing = 1; void test_thd_perffn(void *data) { - struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); - thdcap_t thdc = 0; cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; unsigned int i = 0; @@ -70,9 +68,6 @@ test_thd_perffn(void *data) PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, PERF_ITERS); testing = 0; /* done testing! let the spinfn cleanup! 
*/ - PRINTC("CURR THD: %u\n", (unsigned int)cos_introspect(ci, ci->comp_cap, COMP_GET_SCB_CURTHD)); - thdc = sl_thd_thdcap(sl_thd_curr()); - PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)thdc, (unsigned long)cos_introspect(ci, thdc, THD_GET_DCB_IP), (unsigned long)cos_introspect(ci, thdc, THD_GET_DCB_SP)); sl_thd_yield(0); sl_thd_exit(); @@ -81,17 +76,11 @@ test_thd_perffn(void *data) void test_thd_spinfn(void *data) { - struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); - thdcap_t thdc = 0; - while (likely(testing)) { rdtscll(mid_cycs); sl_thd_yield(0); } - thdc = sl_thd_thdcap(sl_thd_curr()); - PRINTC("CURR THD: %u\n", (unsigned int)cos_introspect(ci, ci->comp_cap, COMP_GET_SCB_CURTHD)); - PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)thdc, (unsigned long)cos_introspect(ci, thdc, THD_GET_DCB_IP), (unsigned long)cos_introspect(ci, thdc, THD_GET_DCB_SP)); sl_thd_exit(); } @@ -102,7 +91,7 @@ test_thd_fn(void *data) int workiters = WORKITERS * ((int)data); printc("%d", (int)data); - SPIN(workiters); + //SPIN(workiters); sl_thd_yield(0); } } @@ -112,12 +101,7 @@ test_yield_perf(void) { int i; struct sl_thd *threads[N_TESTTHDS_PERF]; - struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; - struct cos_scb_info *scb_info = cos_scb_info_get(); - - scb_info->curr_thd = BOOT_CAPTBL_SELF_INITTHD_CPU_BASE; - PRINTC("CURR THD: %u\n", (unsigned int)cos_introspect(ci, ci->comp_cap, COMP_GET_SCB_CURTHD)); for (i = 0; i < N_TESTTHDS_PERF; i++) { if (i == 1) threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); @@ -125,7 +109,6 @@ test_yield_perf(void) assert(threads[i]); sl_thd_param_set(threads[i], sp.v); PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); - PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)sl_thd_thdcap(threads[i]), (unsigned long)cos_introspect(ci, sl_thd_thdcap(threads[i]), THD_GET_DCB_IP), (unsigned long)cos_introspect(ci, sl_thd_thdcap(threads[i]), THD_GET_DCB_SP)); } } @@ -238,8 +221,8 @@ cos_init(void) cos_defcompinfo_init(); sl_init(SL_MIN_PERIOD_US); - test_yield_perf(); - //test_yields(); + //test_yield_perf(); + test_yields(); //test_blocking_directed_yield(); //test_timeout_wakeup(); diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 0b649e7830..3680d47ee9 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -40,9 +40,6 @@ #include #include -extern int thd_dispatch_slowpath(struct sl_thd *t, sched_tok_t tok); -extern int sl_thd_dispatch_slowpath(struct sl_thd *t, sched_tok_t tok); - /* Critical section (cs) API to protect scheduler data-structures */ struct sl_cs { union sl_cs_intern { @@ -407,25 +404,48 @@ sl_thd_is_runnable(struct sl_thd *t) return (t->state == SL_THD_RUNNABLE || t->state == SL_THD_WOKEN); } +int sl_thd_kern_dispatch(thdcap_t t); + static inline int sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { struct cos_scb_info *scb = sl_scb_info_cpu(); __asm__ __volatile__ ( \ + "pushl %%eax\n\t" \ + "pushl %%ebx\n\t" \ + "pushl %%ecx\n\t" \ + "pushl %%edx\n\t" \ + "pushl %%esi\n\t" \ + "pushl %%edi\n\t" \ "movl $2f, (%%eax)\n\t" \ - "movl %%esp, 4(%%eax)\n\t" \ - "cmp $0, 4(%%ebx)\n\t" \ + "movl $3f, 4(%%eax)\n\t" \ + "movl %%esp, 8(%%eax)\n\t" \ + "cmp $0, 8(%%ebx)\n\t" \ "je 1f\n\t" \ "movl %%edx, (%%ecx)\n\t" \ - "movl 4(%%ebx), %%esp\n\t" \ + "movl 8(%%ebx), %%esp\n\t" \ "jmp *(%%ebx)\n\t" \ "1:\n\t" \ - 
"call thd_dispatch_slowpath\n\t" \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "pushl %%edx\n\t" \ + "call sl_thd_kern_dispatch\n\t" \ + "addl $4, %%esp\n\t" \ + "popl %%ebp\n\t" \ + "jmp 3f\n\t" \ "2:\n\t" \ - "movl $0, 4(%%ebx)\n\t" \ + "movl $0, 8(%%ebx)\n\t" \ + "3:\n\t" \ + "popl %%edi\n\t" \ + "popl %%esi\n\t" \ + "popl %%edx\n\t" \ + "popl %%ecx\n\t" \ + "popl %%ebx\n\t" \ + "popl %%eax\n\t" \ : - : "a" (sl_thd_dcbinfo(curr)), "b" (sl_thd_dcbinfo(next)), "S" (next), "D" (tok), + : "a" (sl_thd_dcbinfo(curr)), "b" (sl_thd_dcbinfo(next)), + "S" ((u32_t)((u64_t)tok >> 32)), "D" ((u32_t)(((u64_t)tok << 32) >> 32)), "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); @@ -446,6 +466,7 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) return cos_switch(sl_thd_thdcap(t), sl_thd_tcap(t), t->prio, g->timeout_next, g->sched_rcv, tok); } else { + /* TODO: can't use if you're reprogramming a timer/prio */ return sl_thd_dispatch(t, tok, sl_thd_curr()); } } diff --git a/src/components/lib/sl/Makefile b/src/components/lib/sl/Makefile index 6599156abd..f4bcf0a260 100644 --- a/src/components/lib/sl/Makefile +++ b/src/components/lib/sl/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_lock.o sl_thd_static_backend.o sl_slowpath.o +LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_lock.o sl_thd_static_backend.o LIBS=$(LIB_OBJS:%.o=%.a) CINC+=-m32 diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 30a15b52f1..9411262657 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -191,15 +191,6 @@ sl_thd_sched_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t t return 0; } -int -sl_thd_dispatch_slowpath(struct sl_thd *t, sched_tok_t tok) -{ - struct sl_global_cpu *g = sl__globals_cpu(); - - /* no timeouts for now! */ - return cos_switch(sl_thd_thdcap(t), g->sched_tcap, t->prio, TCAP_TIME_NIL /*t == g->sched_thd ? TCAP_TIME_NIL : g->timeout_next*/, 0 /* don't switch to scheduler in the middle of this! */, tok); -} - /* * Wake "t" up if it was previously blocked on cos_rcv and got * to run before the scheduler (tcap-activated)! 
@@ -603,6 +594,8 @@ sl_init(microsec_t period) g->sched_rcv = BOOT_CAPTBL_SELF_INITRCV_CPU_BASE; g->sched_thd->prio = 0; ps_list_head_init(&g->event_head); + assert(cos_thdid() == sl_thd_thdid(g->sched_thd)); + g->scb_info->curr_thd = 0; g->idle_thd = sl_thd_alloc(sl_idle, NULL); assert(g->idle_thd); @@ -723,3 +716,9 @@ sl_sched_loop_nonblock(void) { sl_sched_loop_intern(1); } + +int +sl_thd_kern_dispatch(thdcap_t t) +{ + return cos_thd_switch(t); +} diff --git a/src/components/lib/sl/sl_slowpath.S b/src/components/lib/sl/sl_slowpath.S deleted file mode 100644 index 88cc2832a5..0000000000 --- a/src/components/lib/sl/sl_slowpath.S +++ /dev/null @@ -1,9 +0,0 @@ -.text -.globl thd_dispatch_slowpath -.type thd_dispatch_slowpath, @function -thd_dispatch_slowpath: - pushl %edi - pushl %esi - call sl_thd_dispatch_slowpath - popl %esi - popl %edi diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index f4ea791bd0..41c4f4b895 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -82,6 +82,44 @@ printfn(struct pt_regs *regs) return 0; } +static struct thread * +cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, struct comp_info **ci_ptr) +{ + struct thread *thd = thd_current(cos_info); + struct cap_thd *ch_ult; + struct thread *ulthd; + capid_t ultc; + int invstk_top; + + *ci_ptr = thd_invstk_current_compinfo(thd, cos_info, &invstk_top); + + /* no user-level thread switches in invocations! */ + if (unlikely(invstk_top)) goto done; + + assert(*ci_ptr && (*ci_ptr)->captbl); + + if (unlikely(!(*ci_ptr)->scb_data)) goto done; + + ultc = (*ci_ptr)->scb_data->curr_thd; + if (!ultc) goto done; + ch_ult = (struct cap_thd *)captbl_lkup((*ci_ptr)->captbl, ultc); + /* TODO: use kernel curr thread? or sched thread in that component? */ + if (unlikely(!CAP_TYPECHK_CORE(ch_ult, CAP_THD))) return NULL; + + /* reset inconsistency from user-level thd! 
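+	 * (Descriptive note: scb->curr_thd is a single-use handoff. The
+	 * user-level dispatch publishes the thdcap it switched to before
+	 * jumping; once the kernel has read it on entry, the field is cleared
+	 * so a stale value cannot redirect a later kernel entry.)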
*/ + (*ci_ptr)->scb_data->curr_thd = 0; + + ulthd = ch_ult->t; + assert(ulthd->dcbinfo); + if (ulthd == thd) goto done; + + thd_current_update(ulthd, thd, cos_info); + thd = ulthd; + +done: + return thd; +} + void cos_cap_ipi_handling(void); void cos_cap_ipi_handling(void) @@ -768,12 +806,11 @@ int cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) { int curr_cpu = get_cpuid(); - struct cap_arcv * arcv; + struct cap_arcv *arcv; struct cos_cpu_local_info *cos_info; - struct thread * rcv_thd, *next, *thd; - struct tcap * rcv_tcap, *tcap, *tcap_next; - struct comp_info * ci; - unsigned long ip, sp; + struct thread *rcv_thd, *next, *thd; + struct tcap *rcv_tcap, *tcap, *tcap_next; + struct comp_info *ci; if (!CAP_TYPECHK(asnd, CAP_ASND)) return 1; assert(asnd->arcv_capid); @@ -789,12 +826,10 @@ cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) cos_info = cos_cpu_local_info(); assert(cos_info); - thd = thd_current(cos_info); - tcap = tcap_current(cos_info); - assert(thd); - ci = thd_invstk_current(thd, &ip, &sp, cos_info); - assert(ci && ci->captbl); + thd = cap_ulthd_restore(regs, cos_info, &ci); + assert(thd && ci && ci->captbl); assert(!(thd->state & THD_STATE_PREEMPTED)); + tcap = tcap_current(cos_info); rcv_thd = arcv->thd; rcv_tcap = rcv_thd->rcvcap.rcvcap_tcap; assert(rcv_tcap && tcap); @@ -837,16 +872,13 @@ int timer_process(struct pt_regs *regs) { struct cos_cpu_local_info *cos_info; - struct thread * thd_curr; - struct comp_info * comp; - unsigned long ip, sp; - cycles_t now; + struct thread *thd_curr; + struct comp_info *comp = NULL; cos_info = cos_cpu_local_info(); assert(cos_info); - thd_curr = thd_current(cos_info); + thd_curr = cap_ulthd_restore(regs, cos_info, &comp); assert(thd_curr && thd_curr->cpuid == get_cpuid()); - comp = thd_invstk_current(thd_curr, &ip, &sp, cos_info); assert(comp); return expended_process(regs, thd_curr, comp, cos_info, 1); @@ -950,7 +982,6 @@ composite_syscall_handler(struct pt_regs *regs) struct comp_info * ci; struct thread * thd; capid_t cap; - unsigned long ip, sp; /* * We lookup this struct (which is on stack) only once, and @@ -960,8 +991,10 @@ composite_syscall_handler(struct pt_regs *regs) int ret = -ENOENT; int thd_switch = 0; + /* Definitely do it for all the fast-path calls. */ + thd = cap_ulthd_restore(regs, cos_info, &ci); + assert(thd); cap = __userregs_getcap(regs); - thd = thd_current(cos_info); /* printk("thd %d calling cap %d (ip %x, sp %x), operation %d: %x, %x, %x, %x\n", thd->tid, cap, * __userregs_getip(regs), __userregs_getsp(regs), __userregs_getop(regs), @@ -981,14 +1014,12 @@ composite_syscall_handler(struct pt_regs *regs) return 0; } - ci = thd_invstk_current(thd, &ip, &sp, cos_info); - assert(ci && ci->captbl); - /* * We don't check the liveness of the current component * because it's guaranteed by component quiescence period, * which is at timer tick granularity. 
*/ + assert(ci && ci->captbl); ch = captbl_lkup(ci->captbl, cap); if (unlikely(!ch)) { printk("cos: cap %d not found!\n", (int)cap); diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index f832d08cdb..d7ef746460 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -405,7 +405,7 @@ struct cos_scb_info { } CACHE_ALIGNED; struct cos_dcb_info { - unsigned long ip; + unsigned long ip, ip_kret; unsigned long sp; } __attribute__((packed)); diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 8861188c86..e58d8f3b5b 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -475,6 +475,21 @@ curr_invstk_top(struct cos_cpu_local_info *cos_info) return cos_info->invstk_top; } +static inline struct comp_info * +thd_invstk_peek_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info, int peek_index) +{ + /* curr_thd should be the current thread! We are using cached invstk_top. */ + return &(curr_thd->invstk[peek_index].comp_info); +} + +static inline struct comp_info * +thd_invstk_current_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info, int *invstk_top) +{ + *invstk_top = curr_invstk_top(cos_info); + + return &(curr_thd->invstk[*invstk_top].comp_info); +} + static inline struct comp_info * thd_invstk_current(struct thread *curr_thd, unsigned long *ip, unsigned long *sp, struct cos_cpu_local_info *cos_info) { @@ -573,6 +588,17 @@ thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) */ } + if (thd->dcbinfo && thd->dcbinfo->sp) { + if (!preempt) { + regs->dx = regs->ip = thd->dcbinfo->ip_kret; + regs->cx = regs->sp = thd->dcbinfo->sp; + } else { + regs->ip = thd->dcbinfo->ip_kret; + regs->sp = thd->dcbinfo->sp; + } + thd->dcbinfo->sp = 0; + } + if (issame && preempt == 0) { __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); } @@ -589,11 +615,9 @@ thd_introspect(struct thread *t, unsigned long op, unsigned long *retval) break; case THD_GET_DCB_IP: *retval = t->dcbinfo->ip; - printk("%lx\n", t->dcbinfo->ip); break; case THD_GET_DCB_SP: *retval = t->dcbinfo->sp; - printk("%lx\n", t->dcbinfo->sp); break; default: return -EINVAL; From d558e3bb4e3bfe1ce38f1cb7ee17c74b77801a0e Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 22 Feb 2019 17:02:08 -0500 Subject: [PATCH 028/127] Removed push and pop of registers (works!) 
and added TODOs --- src/components/include/sl.h | 16 ++++------------ src/kernel/capinv.c | 9 +++++---- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 3680d47ee9..4a37f2a339 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -411,13 +411,10 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { struct cos_scb_info *scb = sl_scb_info_cpu(); + /* TODO: remove 3f label and use dcb->ip + OFFSET for kernel jump */ + /* TODO: token check outside the assembly */ + __asm__ __volatile__ ( \ - "pushl %%eax\n\t" \ - "pushl %%ebx\n\t" \ - "pushl %%ecx\n\t" \ - "pushl %%edx\n\t" \ - "pushl %%esi\n\t" \ - "pushl %%edi\n\t" \ "movl $2f, (%%eax)\n\t" \ "movl $3f, 4(%%eax)\n\t" \ "movl %%esp, 8(%%eax)\n\t" \ @@ -437,12 +434,6 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "2:\n\t" \ "movl $0, 8(%%ebx)\n\t" \ "3:\n\t" \ - "popl %%edi\n\t" \ - "popl %%esi\n\t" \ - "popl %%edx\n\t" \ - "popl %%ecx\n\t" \ - "popl %%ebx\n\t" \ - "popl %%eax\n\t" \ : : "a" (sl_thd_dcbinfo(curr)), "b" (sl_thd_dcbinfo(next)), "S" ((u32_t)((u64_t)tok >> 32)), "D" ((u32_t)(((u64_t)tok << 32) >> 32)), @@ -559,6 +550,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) assert(sl_thd_is_runnable(t)); sl_cs_exit(); + if (t == sl_thd_curr()) return 0; ret = sl_thd_activate(t, tok); /* diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 41c4f4b895..6f50c6b75a 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -82,7 +82,8 @@ printfn(struct pt_regs *regs) return 0; } -static struct thread * +/* TODO: inline fast path and force non-inlined slow-path */ +static inline struct thread * cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, struct comp_info **ci_ptr) { struct thread *thd = thd_current(cos_info); @@ -94,7 +95,7 @@ cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, str *ci_ptr = thd_invstk_current_compinfo(thd, cos_info, &invstk_top); /* no user-level thread switches in invocations! */ - if (unlikely(invstk_top)) goto done; + /* if (unlikely(invstk_top)) goto done; */ assert(*ci_ptr && (*ci_ptr)->captbl); @@ -103,8 +104,7 @@ cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, str ultc = (*ci_ptr)->scb_data->curr_thd; if (!ultc) goto done; ch_ult = (struct cap_thd *)captbl_lkup((*ci_ptr)->captbl, ultc); - /* TODO: use kernel curr thread? or sched thread in that component? */ - if (unlikely(!CAP_TYPECHK_CORE(ch_ult, CAP_THD))) return NULL; + if (unlikely(!CAP_TYPECHK_CORE(ch_ult, CAP_THD))) goto done; /* reset inconsistency from user-level thd! */ (*ci_ptr)->scb_data->curr_thd = 0; @@ -112,6 +112,7 @@ cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, str ulthd = ch_ult->t; assert(ulthd->dcbinfo); if (ulthd == thd) goto done; + /* TODO: check if the threads are running in the same component.. 
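+	 * (Hypothetical sketch only, not implemented by this patch: one way to
+	 * perform such a check would be to compare the component at the top of
+	 * each thread's invocation stack and fall back to "goto done" on a
+	 * mismatch; the helper and exact comparison are left unspecified here.)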
*/ thd_current_update(ulthd, thd, cos_info); thd = ulthd; From 35bfd0dc19bf84e17c25856f542470da4940e00b Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 27 Feb 2019 17:55:42 -0500 Subject: [PATCH 029/127] use offsets for kernel dispatch "ip" in user-level dispatch routine --- src/components/include/sl.h | 12 ++++++------ src/kernel/include/shared/cos_types.h | 17 ++++++++++++++++- src/kernel/include/thd.h | 4 ++-- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 4a37f2a339..623a75834d 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -411,17 +411,15 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { struct cos_scb_info *scb = sl_scb_info_cpu(); - /* TODO: remove 3f label and use dcb->ip + OFFSET for kernel jump */ /* TODO: token check outside the assembly */ __asm__ __volatile__ ( \ "movl $2f, (%%eax)\n\t" \ - "movl $3f, 4(%%eax)\n\t" \ - "movl %%esp, 8(%%eax)\n\t" \ - "cmp $0, 8(%%ebx)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "cmp $0, 4(%%ebx)\n\t" \ "je 1f\n\t" \ "movl %%edx, (%%ecx)\n\t" \ - "movl 8(%%ebx), %%esp\n\t" \ + "movl 4(%%ebx), %%esp\n\t" \ "jmp *(%%ebx)\n\t" \ "1:\n\t" \ "pushl %%ebp\n\t" \ @@ -431,8 +429,10 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "addl $4, %%esp\n\t" \ "popl %%ebp\n\t" \ "jmp 3f\n\t" \ + ".align 4\n\t" \ "2:\n\t" \ - "movl $0, 8(%%ebx)\n\t" \ + "movl $0, 4(%%ebx)\n\t" \ + ".align 4\n\t" \ "3:\n\t" \ : : "a" (sl_thd_dcbinfo(curr)), "b" (sl_thd_dcbinfo(next)), diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index d7ef746460..3e416ab8fc 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -405,10 +405,25 @@ struct cos_scb_info { } CACHE_ALIGNED; struct cos_dcb_info { - unsigned long ip, ip_kret; + unsigned long ip; unsigned long sp; } __attribute__((packed)); +/* + * This is the "ip" the kernel uses to update the thread when it sees that the + * thread is still in user-level dispatch routine. + * This is the offset of instruction after resetting the "next" thread's "sp" to zero + * in a purely user-level dispatch. + * + * Whenever kernel is switching to a thread which has "sp" non-zero, it would switch + * to the "ip" saved in the dcb_info and reset the "sp" of the thread that the kernel + * is dispatching to! + * This is necessary because, if the kernel is dispatching to a thread that was in the + * user-level dispatch routine before, then the only registers that it can restore are + * "ip" and "sp", everything else is either clobbered or saved/loaded at user-level. 
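+ *
+ * As an illustrative mapping to the labels in the user-level routine
+ * (sl_thd_dispatch() in sl.h; a sketch of the intent, not a normative
+ * layout):
+ *
+ *   dcb->ip                   -> address of label "2:" (saved by the thread
+ *                                before it switched away)
+ *   dcb->ip + DCB_IP_KERN_OFF -> label "3:", i.e. just past the instruction
+ *                                that clears the resumed thread's dcb "sp"
+ *
+ * so a kernel-side restore to (ip + DCB_IP_KERN_OFF, sp) skips the sp-reset
+ * that the kernel has already performed on the thread's behalf.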
+ */ +#define DCB_IP_KERN_OFF 8 + struct cos_component_information { struct cos_stack_freelists cos_stacks; long cos_this_spd_id; diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index e58d8f3b5b..17f012c543 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -590,10 +590,10 @@ thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) if (thd->dcbinfo && thd->dcbinfo->sp) { if (!preempt) { - regs->dx = regs->ip = thd->dcbinfo->ip_kret; + regs->dx = regs->ip = thd->dcbinfo->ip + DCB_IP_KERN_OFF; regs->cx = regs->sp = thd->dcbinfo->sp; } else { - regs->ip = thd->dcbinfo->ip_kret; + regs->ip = thd->dcbinfo->ip + DCB_IP_KERN_OFF; regs->sp = thd->dcbinfo->sp; } thd->dcbinfo->sp = 0; From 11e6312da5c3e6ff91093b40546648ecc16d6df5 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 28 Feb 2019 11:27:36 -0500 Subject: [PATCH 030/127] User-level scheduling synchronization using sched_tok in the SCB * TOKEN is only incremented on preemption. * TODO: make sure the sched_tok accessed by the kernel is from SCB of a scheduling component. * Clean up rcvcap sched_counter now! TODO!! --- .../tests/unit_schedtests/unit_schedlib.c | 2 +- src/components/include/cos_component.h | 6 +++ src/components/include/sl.h | 18 +++++++- src/components/lib/cos_kernel_api.c | 4 +- src/kernel/capinv.c | 44 ++++++++++++------- 5 files changed, 53 insertions(+), 21 deletions(-) diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index 9b098260c8..0814ccbd94 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -90,7 +90,7 @@ test_thd_fn(void *data) while (1) { int workiters = WORKITERS * ((int)data); - printc("%d", (int)data); + printc("%c", 'a' + (int)data); //SPIN(workiters); sl_thd_yield(0); } diff --git a/src/components/include/cos_component.h b/src/components/include/cos_component.h index 6c0e4da537..ac43a72092 100644 --- a/src/components/include/cos_component.h +++ b/src/components/include/cos_component.h @@ -220,6 +220,12 @@ cos_scb_info_get(void) return scb_info; } +static inline struct cos_scb_info * +cos_scb_info_get_core(void) +{ + return cos_scb_info_get() + cos_cpuid(); +} + static inline struct cos_dcb_info * cos_init_dcb_get(void) { diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 623a75834d..fe988d27e1 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -411,7 +411,21 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { struct cos_scb_info *scb = sl_scb_info_cpu(); - /* TODO: token check outside the assembly */ + /* + * jump labels in the asm routine: + * + * 1: slowpath dispatch using cos_thd_switch to switch to a thread + * if the dcb sp of the next thread is reset. + * + * 2: if user-level dispatch routine completed successfully so + * the register states still retained and in the dispatched thread + * we reset its dcb sp! + * + * 3: if user-level dispatch was either preempted in the middle + * of this routine or kernel at some point had to switch to a + * thread that co-operatively switched away from this routine. + * NOTE: kernel takes care of resetting dcb sp in this case! 
+ */ __asm__ __volatile__ ( \ "movl $2f, (%%eax)\n\t" \ @@ -440,7 +454,7 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); - return 0; + return sl_scb_info_cpu()->sched_tok != tok ? -EAGAIN : 0; } static inline int diff --git a/src/components/lib/cos_kernel_api.c b/src/components/lib/cos_kernel_api.c index 5d25f85219..4f73b08127 100644 --- a/src/components/lib/cos_kernel_api.c +++ b/src/components/lib/cos_kernel_api.c @@ -860,9 +860,7 @@ cos_thd_wakeup(thdcap_t thd, tcap_t tc, tcap_prio_t prio, tcap_res_t res) sched_tok_t cos_sched_sync(void) { - static sched_tok_t stok[NUM_CPU] CACHE_ALIGNED; - - return ps_faa((unsigned long *)&stok[cos_cpuid()], 1); + return ps_load(&cos_scb_info_get_core()->sched_tok); } int diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 6f50c6b75a..589f7b0695 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -84,13 +84,14 @@ printfn(struct pt_regs *regs) /* TODO: inline fast path and force non-inlined slow-path */ static inline struct thread * -cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, struct comp_info **ci_ptr) +cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, int interrupt, struct comp_info **ci_ptr) { - struct thread *thd = thd_current(cos_info); - struct cap_thd *ch_ult; - struct thread *ulthd; - capid_t ultc; - int invstk_top; + struct thread *thd = thd_current(cos_info); + struct cap_thd *ch_ult; + struct thread *ulthd; + capid_t ultc; + int invstk_top; + struct cos_scb_info *scb_core = NULL; /* per-core scb_info */ *ci_ptr = thd_invstk_current_compinfo(thd, cos_info, &invstk_top); @@ -100,14 +101,20 @@ cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, str assert(*ci_ptr && (*ci_ptr)->captbl); if (unlikely(!(*ci_ptr)->scb_data)) goto done; + scb_core = (((*ci_ptr)->scb_data) + get_cpuid()); - ultc = (*ci_ptr)->scb_data->curr_thd; + if (unlikely(interrupt)) { + assert(scb_core->sched_tok < ~0U); + cos_faa((int *)&(scb_core->sched_tok), 1); + } + + ultc = scb_core->curr_thd; if (!ultc) goto done; ch_ult = (struct cap_thd *)captbl_lkup((*ci_ptr)->captbl, ultc); if (unlikely(!CAP_TYPECHK_CORE(ch_ult, CAP_THD))) goto done; /* reset inconsistency from user-level thd! */ - (*ci_ptr)->scb_data->curr_thd = 0; + scb_core->curr_thd = 0; ulthd = ch_ult->t; assert(ulthd->dcbinfo); @@ -121,7 +128,6 @@ cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, str return thd; } -void cos_cap_ipi_handling(void); void cos_cap_ipi_handling(void) { @@ -659,11 +665,19 @@ cap_switch(struct pt_regs *regs, struct thread *curr, struct thread *next, struc static int cap_sched_tok_validate(struct thread *rcvt, sched_tok_t usr_tok, struct comp_info *ci, struct cos_cpu_local_info *cos_info) { + struct cos_scb_info *scb_core = ci->scb_data + get_cpuid(); + assert(rcvt && usr_tok < ~0U); - /* race-condition check for user-level thread switches */ - if (thd_rcvcap_get_counter(rcvt) > usr_tok) return -EAGAIN; - thd_rcvcap_set_counter(rcvt, usr_tok); + /* + * Kernel increments the sched_tok on preemption only. + * The rest is all co-operative, so if sched_tok in scb page + * increments after someone fetching a tok, then check for that! + * + * FIXME: make sure we're checking the scb of the scheduling component and not in any other component. + * I don't know if the comp_info here is of the scheduling component! 
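+	 * In user-level terms (an illustrative usage sketch, consistent with
+	 * cos_sched_sync() now reading the token straight from the SCB):
+	 *
+	 *   tok = cos_sched_sync();        // snapshot scb->sched_tok
+	 *   ... choose the next thread ...
+	 *   sl_thd_activate(next, tok);    // dispatch re-validates tok
+	 *
+	 * and the dispatch fails with -EAGAIN when a preemption bumped the
+	 * token in between, forcing the scheduler to retry with fresh state.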
+ */ + if (unlikely(scb_core->sched_tok != usr_tok)) return -EAGAIN; return 0; } @@ -827,7 +841,7 @@ cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) cos_info = cos_cpu_local_info(); assert(cos_info); - thd = cap_ulthd_restore(regs, cos_info, &ci); + thd = cap_ulthd_restore(regs, cos_info, 1, &ci); assert(thd && ci && ci->captbl); assert(!(thd->state & THD_STATE_PREEMPTED)); tcap = tcap_current(cos_info); @@ -878,7 +892,7 @@ timer_process(struct pt_regs *regs) cos_info = cos_cpu_local_info(); assert(cos_info); - thd_curr = cap_ulthd_restore(regs, cos_info, &comp); + thd_curr = cap_ulthd_restore(regs, cos_info, 1, &comp); assert(thd_curr && thd_curr->cpuid == get_cpuid()); assert(comp); @@ -993,7 +1007,7 @@ composite_syscall_handler(struct pt_regs *regs) int thd_switch = 0; /* Definitely do it for all the fast-path calls. */ - thd = cap_ulthd_restore(regs, cos_info, &ci); + thd = cap_ulthd_restore(regs, cos_info, 0, &ci); assert(thd); cap = __userregs_getcap(regs); From bedf48e16a44073e5275c78db62c04aa00332223 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 7 Mar 2019 19:09:33 -0500 Subject: [PATCH 031/127] scb and dcb capabilities and resources * Have SCB capability and resource working. * TODO: DCB capabilities to work. * TODO: Fix the API around SCB frontier. * SCB address in a component will be the start of the heap pointer and the INIT DCB (initial dcb caps that are used when creating the INIT threads in those components) are next to SCB and statically set to be NUM_CPU number of pages. This is the idea to fix their addresses and avoid passing in component_information structure. --- .../implementation/capmgr/naive/cap_info.c | 2 +- .../implementation/capmgr/naive/init.c | 2 +- .../no_interface/llbooter/boot_deps.h | 29 +++-- .../no_interface/vkernel/vk_api.c | 4 +- .../no_interface/vkernel/vkernel.c | 2 +- .../no_interface/vkernel/vm_booter.c | 3 +- .../tests/micro_booter/mb_tests.c | 15 +-- .../tests/micro_booter/micro_booter.c | 6 +- .../tests/unit_defcompinfo/unit_defcompinfo.c | 5 +- src/components/include/cos_component.h | 10 +- src/components/include/cos_defkernel_api.h | 6 +- src/components/include/cos_kernel_api.h | 14 ++- src/components/lib/cos_defkernel_api.c | 10 +- src/components/lib/cos_kernel_api.c | 73 +++++++++---- src/components/lib/sl/sl_sched.c | 1 + src/kernel/capinv.c | 61 ++++++++++- src/kernel/include/captbl.h | 2 +- src/kernel/include/component.h | 39 ++++--- src/kernel/include/dcb.h | 101 ++++++++++++++++++ src/kernel/include/scb.h | 100 +++++++++++++++++ src/kernel/include/shared/cos_types.h | 24 ++++- src/kernel/include/thd.h | 2 + src/platform/i386/boot_comp.c | 18 ++-- 23 files changed, 421 insertions(+), 108 deletions(-) create mode 100644 src/kernel/include/dcb.h create mode 100644 src/kernel/include/scb.h diff --git a/src/components/implementation/capmgr/naive/cap_info.c b/src/components/implementation/capmgr/naive/cap_info.c index 1d2e242e7c..de9c015100 100644 --- a/src/components/implementation/capmgr/naive/cap_info.c +++ b/src/components/implementation/capmgr/naive/cap_info.c @@ -69,7 +69,7 @@ cap_info_comp_init(spdid_t spdid, captblcap_t captbl_cap, pgtblcap_t pgtbl_cap, capci[spdid].cid = spdid; cos_meminfo_init(&ci->mi, 0, 0, 0); cos_compinfo_init(ci, pgtbl_cap, captbl_cap, compcap, heap_frontier, cap_frontier, - cos_compinfo_get(cos_defcompinfo_curr_get())); + 0, 0, cos_compinfo_get(cos_defcompinfo_curr_get())); memset(rglb, 0, sizeof(struct cap_shmem_glb_info)); memset(cap_shi, 0, sizeof(struct cap_shmem_info)); diff --git 
a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index 77c9b4e0ea..1849eada9b 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -184,7 +184,7 @@ cos_init(void) cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init_ext(BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, - BOOT_CAPTBL_SELF_COMP, heap_frontier, cap_frontier); + BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_SCB, heap_frontier - COS_SCB_SIZE, heap_frontier, cap_frontier); cap_info_init(); sl_init(SL_MIN_PERIOD_US); capmgr_comp_info_iter(); diff --git a/src/components/implementation/no_interface/llbooter/boot_deps.h b/src/components/implementation/no_interface/llbooter/boot_deps.h index 3c9254beda..a622fb9517 100644 --- a/src/components/implementation/no_interface/llbooter/boot_deps.h +++ b/src/components/implementation/no_interface/llbooter/boot_deps.h @@ -164,17 +164,6 @@ boot_comp_mem_alloc(spdid_t spdid) cos_meminfo_alloc(compinfo, BOOT_MEM_KM_BASE, mem_sz); } -static void -boot_comp_scb_alloc(spdid_t spdid) -{ - struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); - struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); - struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); - - spdinfo->scbpg = (vaddr_t)cos_scbpg_bump_allocn(compinfo, COS_SCB_SIZE); - assert(spdinfo->scbpg); -} - /* TODO: Should booter create that INITDCB page for all components for each core? */ static void boot_comp_dcb_alloc(spdid_t spdid) @@ -194,13 +183,20 @@ boot_compinfo_init(spdid_t spdid, captblcap_t *ct, pgtblcap_t *pt, u32_t heap_st { struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); + scbcap_t scbc = 0; + vaddr_t scb_uaddr = 0; - *ct = cos_captbl_alloc(boot_info); + *ct = cos_captbl_alloc(boot_info); assert(*ct); - *pt = cos_pgtbl_alloc(boot_info); + *pt = cos_pgtbl_alloc(boot_info); assert(*pt); + scbc = cos_pgtbl_alloc(boot_info); + assert(scbc); - cos_compinfo_init(compinfo, *pt, *ct, 0, (vaddr_t)heap_start_vaddr, BOOT_CAPTBL_FREE, boot_info); + cos_compinfo_init(compinfo, *pt, *ct, 0, scbc, 0, (vaddr_t)heap_start_vaddr, BOOT_CAPTBL_FREE, boot_info); + scb_uaddr = cos_page_bump_intern_valloc(compinfo, COS_SCB_SIZE); + assert(scb_uaddr); + compinfo->scb_vas = scb_uaddr; /* * if this is a capmgr, let it manage its share (ideally rest of system memory) of memory. 
@@ -405,11 +401,10 @@ boot_newcomp_create(spdid_t spdid, struct cos_compinfo *comp_info) invtoken_t token = (invtoken_t)spdid; int ret; - boot_comp_scb_alloc(spdid); - cc = cos_comp_alloc(boot_info, ct, pt, (vaddr_t)spdinfo->upcall_entry, spdinfo->scbpg); + /* scb info created on compinfo_init */ + cc = cos_comp_alloc(boot_info, ct, pt, compinfo->scb_cap, (vaddr_t)spdinfo->upcall_entry, compinfo->scb_vas); assert(cc); compinfo->comp_cap = cc; - cobj_info->cos_scb_data = (struct cos_scb_info *)spdinfo->scbpg; /* Create sinv capability from Userspace to Booter components */ sinv = cos_sinv_alloc(boot_info, boot_info->comp_cap, (vaddr_t)hypercall_entry_rets_inv, token); diff --git a/src/components/implementation/no_interface/vkernel/vk_api.c b/src/components/implementation/no_interface/vkernel/vk_api.c index feb5c20a3f..2a2d5277cf 100644 --- a/src/components/implementation/no_interface/vkernel/vk_api.c +++ b/src/components/implementation/no_interface/vkernel/vk_api.c @@ -35,9 +35,9 @@ vk_vm_create(struct vms_info *vminfo, struct vkernel_info *vkinfo) cos_meminfo_init(&(vmcinfo->mi), BOOT_MEM_KM_BASE, VM_UNTYPED_SIZE, vmutpt); ret = cos_defcompinfo_child_alloc(vmdci, (vaddr_t)&cos_upcall_entry, (vaddr_t)BOOT_MEM_VM_BASE, - VM_CAPTBL_FREE, 1, &initdcbpg, &scbpg); + VM_CAPTBL_FREE, 1, &initdcbpg); cos_compinfo_init(&(vminfo->shm_cinfo), vmcinfo->pgtbl_cap, vmcinfo->captbl_cap, vmcinfo->comp_cap, - (vaddr_t)VK_VM_SHM_BASE, VM_CAPTBL_FREE, vk_cinfo); + vmcinfo->scb_cap, (vaddr_t)VK_VM_SHM_BASE, (vaddr_t)(VK_VM_SHM_BASE + COS_SCB_SIZE), VM_CAPTBL_FREE, vk_cinfo); printc("\tCreating and copying initial component capabilities\n"); ret = cos_cap_cpy_at(vmcinfo, BOOT_CAPTBL_SELF_CT, vk_cinfo, vmcinfo->captbl_cap); diff --git a/src/components/implementation/no_interface/vkernel/vkernel.c b/src/components/implementation/no_interface/vkernel/vkernel.c index bab3e7af1f..af106a5bba 100644 --- a/src/components/implementation/no_interface/vkernel/vkernel.c +++ b/src/components/implementation/no_interface/vkernel/vkernel.c @@ -51,7 +51,7 @@ cos_init(void) * Or use some offset into the future in CAPTBL_FREE */ cos_compinfo_init(&vk_info.shm_cinfo, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - (vaddr_t)VK_VM_SHM_BASE, BOOT_CAPTBL_FREE, ci); + BOOT_CAPTBL_SELF_SCB, (vaddr_t)cos_scb_info_get(), (vaddr_t)VK_VM_SHM_BASE, BOOT_CAPTBL_FREE, ci); vk_info.termthd = cos_thd_alloc(vk_cinfo, vk_cinfo->comp_cap, vk_terminate, NULL, 0, 0); assert(vk_info.termthd); diff --git a/src/components/implementation/no_interface/vkernel/vm_booter.c b/src/components/implementation/no_interface/vkernel/vm_booter.c index 59b9d6435e..67c634290d 100644 --- a/src/components/implementation/no_interface/vkernel/vm_booter.c +++ b/src/components/implementation/no_interface/vkernel/vm_booter.c @@ -29,8 +29,9 @@ vm_init(void *d) vmid = cos_sinv(VM_CAPTBL_SELF_SINV_BASE, VK_SERV_VM_ID << 16 | cos_thdid(), 0, 0, 0); cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, VM_UNTYPED_SIZE, BOOT_CAPTBL_SELF_UNTYPED_PT); + /* FIXME: will need to verify if scb stuff works here */ cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - (vaddr_t)cos_get_heap_ptr(), vmid == 0 ? DOM0_CAPTBL_FREE : VM_CAPTBL_FREE, &booter_info); + BOOT_CAPTBL_SELF_SCB, (vaddr_t)cos_scb_info_get(), (vaddr_t)cos_get_heap_ptr(), vmid == 0 ? 
DOM0_CAPTBL_FREE : VM_CAPTBL_FREE, &booter_info); PRINTC("Virtual-machine booter started.\n"); test_run_vk(); diff --git a/src/components/implementation/tests/micro_booter/mb_tests.c b/src/components/implementation/tests/micro_booter/mb_tests.c index ecfa74cdbd..68f9fd1981 100644 --- a/src/components/implementation/tests/micro_booter/mb_tests.c +++ b/src/components/implementation/tests/micro_booter/mb_tests.c @@ -825,11 +825,8 @@ test_inv(void) compcap_t cc; sinvcap_t ic; unsigned int r; - vaddr_t scbpg; - scbpg = (vaddr_t)cos_scbpg_bump_allocn(&booter_info, PAGE_SIZE); - assert(scbpg); - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL, scbpg); + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, 0); assert(cc > 0); ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); assert(ic > 0); @@ -847,11 +844,8 @@ test_inv_perf(void) int i; long long total_inv_cycles = 0LL, total_ret_cycles = 0LL; unsigned int ret; - vaddr_t scbpg; - scbpg = (vaddr_t)cos_scbpg_bump_allocn(&booter_info, PAGE_SIZE); - assert(scbpg); - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL, scbpg); + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, 0); assert(cc > 0); ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); assert(ic > 0); @@ -880,11 +874,8 @@ test_captbl_expand(void) { int i; compcap_t cc; - vaddr_t scbpg; - scbpg = (vaddr_t)cos_scbpg_bump_allocn(&booter_info, PAGE_SIZE); - assert(scbpg); - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, (vaddr_t)NULL, scbpg); + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, 0); assert(cc); for (i = 0; i < 1024; i++) { sinvcap_t ic; diff --git a/src/components/implementation/tests/micro_booter/micro_booter.c b/src/components/implementation/tests/micro_booter/micro_booter.c index 3868c90c02..a6328c4278 100644 --- a/src/components/implementation/tests/micro_booter/micro_booter.c +++ b/src/components/implementation/tests/micro_booter/micro_booter.c @@ -63,7 +63,7 @@ cos_init(void) first_init = 0; cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE, &booter_info); + BOOT_CAPTBL_SELF_SCB, (vaddr_t)cos_scb_info_get(), (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE, &booter_info); init_done = 1; } @@ -72,7 +72,7 @@ cos_init(void) initaddr = cos_init_dcb_get(); PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_SP)); initaddr->ip = 10; - initaddr->sp = 20; + initaddr->sp = 0; PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_SP)); @@ -81,7 +81,7 @@ cos_init(void) assert(termthd[cos_cpuid()]); PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)termthd[cos_cpuid()], (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], 
THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_SP)); termaddr->ip = 30; - termaddr->sp = 40; + termaddr->sp = 0; PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)termthd[cos_cpuid()], (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_SP)); PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_SP)); PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)termthd[cos_cpuid()], (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_SP)); diff --git a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c index 5b6a306e4f..c9f5398df7 100644 --- a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c +++ b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c @@ -125,7 +125,7 @@ cos_init(void) cos_defcompinfo_init(); for (id = 0; id < CHILD_COMP_COUNT; id++) { - vaddr_t vm_range, addr, scbaddr, dcbaddr; + vaddr_t vm_range, addr, dcbaddr; pgtblcap_t child_utpt; int is_sched = ((id == CHILD_SCHED_ID) ? 1 : 0); struct cos_compinfo *child_ci = cos_compinfo_get(&child_defci[id]); @@ -136,7 +136,7 @@ cos_init(void) cos_meminfo_init(&(child_ci->mi), BOOT_MEM_KM_BASE, CHILD_UNTYPED_SIZE, child_utpt); cos_defcompinfo_child_alloc(&child_defci[id], (vaddr_t)&cos_upcall_entry, - (vaddr_t)BOOT_MEM_VM_BASE, BOOT_CAPTBL_FREE, is_sched, &dcbaddr, &scbaddr); + (vaddr_t)BOOT_MEM_VM_BASE, BOOT_CAPTBL_FREE, is_sched, &dcbaddr); printc("\t\tCopying new capabilities\n"); ret = cos_cap_cpy_at(child_ci, BOOT_CAPTBL_SELF_CT, ci, child_ci->captbl_cap); @@ -147,6 +147,7 @@ cos_init(void) assert(ret == 0); ret = cos_cap_cpy_at(child_ci, BOOT_CAPTBL_SELF_COMP, ci, child_ci->comp_cap); assert(ret == 0); + /* FIXME: copy BOOT_CAPTBL_SELF_SCB cap?? */ ret = cos_cap_cpy_at(child_ci, BOOT_CAPTBL_SELF_INITTHD_BASE, ci, cos_sched_aep_get(&child_defci[id])->thd); diff --git a/src/components/include/cos_component.h b/src/components/include/cos_component.h index ac43a72092..4217ccb09b 100644 --- a/src/components/include/cos_component.h +++ b/src/components/include/cos_component.h @@ -203,8 +203,8 @@ cos_spd_id(void) static inline void * cos_get_heap_ptr(void) { - /* page at heap_ptr is actually the SCB_PAGE for the booter alone! */ - unsigned int off = (cos_spd_id() == 0 ? (COS_SCB_SIZE + (PAGE_SIZE * NUM_CPU)) : 0); + /* page at heap_ptr is actually the SCB_PAGE for any component. 
*/ + unsigned int off = COS_SCB_SIZE + (PAGE_SIZE * NUM_CPU); void *heap_ptr = ((void *)(cos_comp_info.cos_heap_ptr + off)); return heap_ptr; @@ -213,11 +213,7 @@ cos_get_heap_ptr(void) static inline struct cos_scb_info * cos_scb_info_get(void) { - struct cos_scb_info *scb_info = cos_comp_info.cos_scb_data; - - if (cos_spd_id() == 0) scb_info = (struct cos_scb_info *)(cos_comp_info.cos_heap_ptr); - - return scb_info; + return (struct cos_scb_info *)(cos_comp_info.cos_heap_ptr); } static inline struct cos_scb_info * diff --git a/src/components/include/cos_defkernel_api.h b/src/components/include/cos_defkernel_api.h index c39e77c0f5..629ada6432 100644 --- a/src/components/include/cos_defkernel_api.h +++ b/src/components/include/cos_defkernel_api.h @@ -86,7 +86,7 @@ void cos_defcompinfo_init(void); * passed. */ void cos_defcompinfo_init_ext(tcap_t sched_tc, thdcap_t sched_thd, arcvcap_t sched_rcv, pgtblcap_t pgtbl_cap, - captblcap_t captbl_cap, compcap_t comp_cap, vaddr_t heap_ptr, capid_t cap_frontier); + captblcap_t captbl_cap, compcap_t comp_cap, scbcap_t scb_cap, vaddr_t scb_ptr, vaddr_t heap_ptr, capid_t cap_frontier); /* for AP cores */ void cos_defcompinfo_sched_init_ext(tcap_t sched_tc, thdcap_t sched_thd, arcvcap_t sched_rcv); @@ -96,10 +96,10 @@ void cos_defcompinfo_sched_init(void); * cos_defcompinfo_child_alloc: called to create a new child component including initial capabilities like pgtbl, * captbl, compcap, aep. if is_sched is set, scheduling end-point will also be created for the child component, else, * the current component's scheduler will remain the scheduler for the child component. - * NOTE: dcbuaddr is the address in child_dci page-table and scbuaddr too!. + * NOTE: dcbuaddr is the address in child_dci page-table!. */ int cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, vaddr_t heap_ptr, - capid_t cap_frontier, int is_sched, vaddr_t *dcbuaddr, vaddr_t *scbuaddr); + capid_t cap_frontier, int is_sched, vaddr_t *dcbuaddr); /* * cos_aep_alloc: creates a new async activation end-point which includes thread, tcap and rcv capabilities. diff --git a/src/components/include/cos_kernel_api.h b/src/components/include/cos_kernel_api.h index 28cd57d713..a67d50ac84 100644 --- a/src/components/include/cos_kernel_api.h +++ b/src/components/include/cos_kernel_api.h @@ -54,6 +54,8 @@ typedef capid_t compcap_t; typedef capid_t captblcap_t; typedef capid_t pgtblcap_t; typedef capid_t hwcap_t; +typedef capid_t scbcap_t; +typedef capid_t dcbcap_t; /* Memory source information */ struct cos_meminfo { @@ -65,7 +67,7 @@ struct cos_meminfo { /* Component captbl/pgtbl allocation information */ struct cos_compinfo { /* capabilities to higher-order capability tables (or -1) */ - capid_t pgtbl_cap, captbl_cap, comp_cap; + capid_t pgtbl_cap, captbl_cap, comp_cap, scb_cap; /* the frontier of unallocated caps, and the allocated captbl range */ capid_t cap_frontier, caprange_frontier; /* the frontier for each of the various sizes of capability per core! 
*/ @@ -75,13 +77,14 @@ struct cos_compinfo { /* the source of memory */ struct cos_compinfo *memsrc; /* might be self-referential */ struct cos_meminfo mi; /* only populated for the component with real memory */ + vaddr_t scb_vas; /* scb virtual address in the component's pgtbl */ struct ps_lock cap_lock, mem_lock; /* locks to make the cap frontier and mem frontier updates and expands atomic */ struct ps_lock va_lock; /* lock to make the vas frontier and bump expands for vas atomic */ }; void cos_compinfo_init(struct cos_compinfo *ci, pgtblcap_t pgtbl_cap, captblcap_t captbl_cap, compcap_t comp_cap, - vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources); + scbcap_t scb_cap, vaddr_t scb_vas, vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources); /* * This only needs be called on compinfos that are managing resources * (i.e. likely only one). All of the capabilities will be relative @@ -108,10 +111,12 @@ int cos_pgtbl_intern_expandwith(struct cos_compinfo *ci, pgtblcap_t intern, vadd * correctly populate ci (allocating all resources from ci_resources). */ int cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, - struct cos_compinfo *ci_resources, vaddr_t *scbpg); + struct cos_compinfo *ci_resources); captblcap_t cos_captbl_alloc(struct cos_compinfo *ci); pgtblcap_t cos_pgtbl_alloc(struct cos_compinfo *ci); -compcap_t cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, vaddr_t entry, vaddr_t scbpg); +compcap_t cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, scbcap_t scbc, vaddr_t entry, vaddr_t scb_addr); +scbcap_t cos_scb_alloc(struct cos_compinfo *ci); +dcbcap_t cos_dcb_alloc(struct cos_compinfo *ci, vaddr_t *dcb_uaddr); typedef void (*cos_thd_fn_t)(void *); thdcap_t cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, pgtblcap_t ptc, vaddr_t dcbuaddr); @@ -197,5 +202,6 @@ int cos_hw_cycles_per_usec(hwcap_t hwc); int cos_hw_cycles_thresh(hwcap_t hwc); capid_t cos_capid_bump_alloc(struct cos_compinfo *ci, cap_t cap); +vaddr_t cos_page_bump_intern_valloc(struct cos_compinfo *ci, size_t sz); #endif /* COS_KERNEL_API_H */ diff --git a/src/components/lib/cos_defkernel_api.c b/src/components/lib/cos_defkernel_api.c index 6ff0beb15f..97c8b364f8 100644 --- a/src/components/lib/cos_defkernel_api.c +++ b/src/components/lib/cos_defkernel_api.c @@ -42,20 +42,20 @@ cos_defcompinfo_init(void) cos_defcompinfo_init_ext(BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, - BOOT_CAPTBL_SELF_COMP, (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE); + BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_SCB, (vaddr_t)cos_scb_info_get(), (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE); } void cos_defcompinfo_init_ext(tcap_t sched_tc, thdcap_t sched_thd, arcvcap_t sched_rcv, pgtblcap_t pgtbl_cap, - captblcap_t captbl_cap, compcap_t comp_cap, vaddr_t heap_ptr, capid_t cap_frontier) + captblcap_t captbl_cap, compcap_t comp_cap, scbcap_t scb_cap, vaddr_t scb_ptr, vaddr_t heap_ptr, capid_t cap_frontier) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); if (curr_defci_init_status == INITIALIZED) return; - cos_compinfo_init(ci, pgtbl_cap, captbl_cap, comp_cap, heap_ptr, cap_frontier, ci); + cos_compinfo_init(ci, pgtbl_cap, captbl_cap, comp_cap, scb_cap, scb_ptr, heap_ptr, cap_frontier, ci); curr_defci_init_status = 
INITIALIZED; cos_defcompinfo_sched_init_ext(sched_tc, sched_thd, sched_rcv); } @@ -121,7 +121,7 @@ cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, int cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, vaddr_t heap_ptr, capid_t cap_frontier, - int is_sched, vaddr_t *dcbuaddr, vaddr_t *scbaddr) + int is_sched, vaddr_t *dcbuaddr) { int ret; struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); @@ -131,7 +131,7 @@ cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, struct cos_aep_info *child_aep = cos_sched_aep_get(child_defci); assert(curr_defci_init_status == INITIALIZED); - ret = cos_compinfo_alloc(child_ci, heap_ptr, cap_frontier, entry, ci, scbaddr); + ret = cos_compinfo_alloc(child_ci, heap_ptr, cap_frontier, entry, ci); if (ret) return ret; *dcbuaddr = (vaddr_t)cos_dcbpg_bump_allocn(child_ci, PAGE_SIZE); assert(*dcbuaddr); diff --git a/src/components/lib/cos_kernel_api.c b/src/components/lib/cos_kernel_api.c index 4f73b08127..17a61c629f 100644 --- a/src/components/lib/cos_kernel_api.c +++ b/src/components/lib/cos_kernel_api.c @@ -34,7 +34,7 @@ __compinfo_metacap(struct cos_compinfo *ci) static inline void cos_vasfrontier_init(struct cos_compinfo *ci, vaddr_t heap_ptr) { - ci->vas_frontier = heap_ptr; + ci->vas_frontier = heap_ptr; /* * The first allocation should trigger PTE allocation, unless * it is in the middle of a PGD, in which case we assume one @@ -71,24 +71,27 @@ cos_capfrontier_init(struct cos_compinfo *ci, capid_t cap_frontier) void cos_compinfo_init(struct cos_compinfo *ci, pgtblcap_t pgtbl_cap, captblcap_t captbl_cap, compcap_t comp_cap, - vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources) + scbcap_t scb_cap, vaddr_t scb_vas, vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources) { assert(ci && ci_resources); assert(cap_frontier % CAPMAX_ENTRY_SZ == 0); ci->memsrc = ci_resources; assert(ci_resources->memsrc == ci_resources); /* prevent infinite data-structs */ - - ci->pgtbl_cap = pgtbl_cap; - ci->captbl_cap = captbl_cap; - ci->comp_cap = comp_cap; - - cos_vasfrontier_init(ci, heap_ptr); - cos_capfrontier_init(ci, cap_frontier); - ps_lock_init(&ci->cap_lock); ps_lock_init(&ci->mem_lock); ps_lock_init(&ci->va_lock); + + ci->pgtbl_cap = pgtbl_cap; + ci->captbl_cap = captbl_cap; + ci->comp_cap = comp_cap; + ci->scb_cap = scb_cap; + + assert(!scb_vas || scb_vas + COS_SCB_SIZE <= heap_ptr); + ci->scb_vas = scb_vas; + + cos_capfrontier_init(ci, cap_frontier); + cos_vasfrontier_init(ci, heap_ptr); } /**************** [Memory Capability Allocation Functions] ***************/ @@ -469,7 +472,7 @@ __page_bump_mem_alloc(struct cos_compinfo *ci, vaddr_t *mem_addr, vaddr_t *mem_f struct cos_compinfo *meta = __compinfo_metacap(ci); size_t rounded; - printd("__page_bump_alloc\n"); + printd("__page_bump_mem_alloc\n"); assert(sz % PAGE_SIZE == 0); assert(meta == __compinfo_metacap(meta)); /* prevent unbounded structures */ @@ -506,6 +509,12 @@ __page_bump_valloc(struct cos_compinfo *ci, size_t sz) return ret_addr; } +vaddr_t +cos_page_bump_intern_valloc(struct cos_compinfo *ci, size_t sz) +{ + return __page_bump_valloc(ci, sz); +} + static vaddr_t __page_bump_alloc(struct cos_compinfo *ci, size_t sz, int shared) { @@ -657,30 +666,53 @@ cos_pgtbl_alloc(struct cos_compinfo *ci) return cap; } +scbcap_t +cos_scb_alloc(struct cos_compinfo *ci) +{ + vaddr_t kmem; + capid_t cap; + u32_t lid = livenessid_bump_alloc(); + + printd("cos_scb_alloc\n"); + + assert(ci 
&& lid); + + if (__alloc_mem_cap(ci, CAP_SCB, &kmem, &cap)) return 0; + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_SCB_ACTIVATE, cap, __compinfo_metacap(ci)->mi.pgtbl_cap, kmem, lid)) + BUG(); + + return cap; +} + compcap_t -cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, vaddr_t entry, vaddr_t scbpg) +cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, scbcap_t scbc, vaddr_t entry, vaddr_t uaddr) { capid_t cap; + /* FIXME: same or diff liveness ids in scb and comp resources? */ u32_t lid = livenessid_bump_alloc(); printd("cos_comp_alloc\n"); - assert(ci && ctc && ptc && lid && scbpg); + assert(ci && ctc && ptc && lid); + /* FIXME: packing scbc in 12 bits */ + assert(scbc < (1 << 12)); cap = __capid_bump_alloc(ci, CAP_COMP); if (!cap) return 0; - if (call_cap_op(ci->captbl_cap, CAPTBL_OP_COMPACTIVATE, (lid << 16) | cap, (ctc << 16) | ptc, scbpg, entry)) BUG(); + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_COMPACTIVATE, (lid << 16) | cap, (ctc << 16) | ptc, uaddr | scbc, entry)) BUG(); return cap; } int cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, - struct cos_compinfo *ci_resources, vaddr_t *scbpg) + struct cos_compinfo *ci_resources) { pgtblcap_t ptc; captblcap_t ctc; compcap_t compc; + scbcap_t scbc; + vaddr_t scb_vaddr; printd("cos_compinfo_alloc\n"); @@ -688,13 +720,16 @@ cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_fronti assert(ptc); ctc = cos_captbl_alloc(ci_resources); assert(ctc); - cos_compinfo_init(ci, ptc, ctc, 0, heap_ptr, cap_frontier, ci_resources); - *scbpg = (vaddr_t)cos_scbpg_bump_allocn(ci, COS_SCB_SIZE); - assert(*scbpg); + scbc = cos_scb_alloc(ci_resources); + assert(scbc); + cos_compinfo_init(ci, ptc, ctc, 0, scbc, 0, heap_ptr, cap_frontier, ci_resources); - compc = cos_comp_alloc(ci_resources, ctc, ptc, entry, *scbpg); + scb_vaddr = (vaddr_t)__page_bump_valloc(ci, COS_SCB_SIZE); + assert(scb_vaddr); + compc = cos_comp_alloc(ci_resources, ctc, ptc, scbc, entry, scb_vaddr); assert(compc); ci->comp_cap = compc; + ci->scb_vas = scb_vaddr; return 0; } diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 9411262657..48ad45f205 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -720,5 +720,6 @@ sl_sched_loop_nonblock(void) int sl_thd_kern_dispatch(thdcap_t t) { + //return cos_switch(t, sl__globals_cpu()->sched_tcap, 0, sl__globals_cpu()->timeout_next, sl__globals_cpu()->sched_rcv, cos_sched_sync()); return cos_thd_switch(t); } diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 589f7b0695..73fda151e5 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -15,6 +15,8 @@ #include "include/tcap.h" #include "include/chal/defs.h" #include "include/hw.h" +#include "include/scb.h" +//#include "include/dcb.h" #define COS_DEFAULT_RET_CAP 0 @@ -1310,10 +1312,11 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * capid_t pgtbl_cap = __userregs_get2(regs) & 0xFFFF; livenessid_t lid = (capin >> 16); capid_t comp_cap = (capin << 16) >> 16; - vaddr_t scb_uaddr = __userregs_get3(regs); + vaddr_t scb_uaddr = __userregs_get3(regs) | ~((1 << 12) - 1); vaddr_t entry_addr = __userregs_get4(regs); + capid_t scb_cap = __userregs_get3(regs) & ((1 << 12) - 1); - ret = comp_activate(ct, cap, comp_cap, captbl_cap, pgtbl_cap, lid, entry_addr, scb_uaddr); + ret = comp_activate(ct, cap, comp_cap, captbl_cap, pgtbl_cap, scb_cap, lid, entry_addr, scb_uaddr); break; 
} case CAPTBL_OP_COMPDEACTIVATE: { @@ -1422,6 +1425,60 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * ret = hw_deactivate(op_cap, capin, lid); break; } + case CAPTBL_OP_SCB_ACTIVATE: { + capid_t ptcap = __userregs_get2(regs); + livenessid_t lid = __userregs_get3(regs); + vaddr_t addr = __userregs_get4(regs); + unsigned long *pte; + struct cos_scb_info *scb; + + ret = cap_kmem_activate(ct, ptcap, addr, (unsigned long *)&scb, &pte); + if (ret) cos_throw(err, ret); + + ret = scb_activate(ct, cap, capin, (vaddr_t)scb, lid); + + break; + } + case CAPTBL_OP_SCB_DEACTIVATE: { + u32_t r2 = __userregs_get2(regs); + livenessid_t lid = r2 >> 16; + capid_t ptcap = (r2 << 16) >> 16; + capid_t cf_addr = __userregs_get3(regs); + + ret = scb_deactivate(op_cap, capin, ptcap, cf_addr, lid); + + break; + } +// case CAPTBL_OP_DCB_ACTIVATE: { +// u32_t r1 = __userregs_get1(regs); +// u32_t r2 = __userregs_get2(regs); +// u32_t r3 = __userregs_get3(regs); +// u32_t r4 = __userregs_get4(regs); +// capid_t dcbcap = r1 >> 16; +// capid_t ptcap = (r1 << 16) >> 16; +// livenessid_t lid = r2 >> 16; +// capid_t ptcapin = (r2 << 16) >> 16; +// vaddr_t kaddr = r3; +// vaddr_t uaddrin = r4; +// +// ret = dcb_activate(ct, cap, dcbcap, ptcap, kaddr, lid, ptcapin, uaddr); +// +// break; +// } +// case CAPTBL_OP_DCB_DEACTIVATE: { +// u32_t r2 = __userregs_get2(regs); +// u32_t r3 = __userregs_get3(regs); +// u32_t r4 = __userregs_get4(regs); +// livenessid_t lid = r2 >> 16; +// capid_t ptcap = (r2 << 16) >> 16; +// vaddr_t cf_addr = r3 & (~0 << 12); +// vaddr_t uaddrin = r4 & (~0 << 12); +// capid_t ptcapin = (r4 << 20) >> 12 | ((r3 << 20) >> 20); +// +// ret = dcb_deactivate(ct, capin, lid, ptcap, cf_addr, ptcapin, uaddrin); +// +// break; +// } default: goto err; } diff --git a/src/kernel/include/captbl.h b/src/kernel/include/captbl.h index 102fe147d3..7530b06796 100644 --- a/src/kernel/include/captbl.h +++ b/src/kernel/include/captbl.h @@ -51,7 +51,7 @@ typedef enum { #define CAP_HEAD_AMAP_SZ 4 #define CAP_HEAD_SZ_SZ 2 #define CAP_HEAD_FLAGS_SZ 3 -#define CAP_HEAD_TYPE_SZ 7 +#define CAP_HEAD_TYPE_SZ CAP_TYPE_MAXBITS /* * This is the header for each capability. 
Includes information about diff --git a/src/kernel/include/component.h b/src/kernel/include/component.h index 0c6c31581b..6d6f8d18f9 100644 --- a/src/kernel/include/component.h +++ b/src/kernel/include/component.h @@ -16,56 +16,60 @@ struct comp_info { struct liveness_data liveness; pgtbl_t pgtbl; - struct captbl * captbl; + struct captbl *captbl; struct cos_scb_info *scb_data; } __attribute__((packed)); struct cap_comp { struct cap_header h; vaddr_t entry_addr; - struct cap_pgtbl * pgd; + struct cap_pgtbl *pgd; struct cap_captbl *ct_top; struct comp_info info; } __attribute__((packed)); +#include "scb.h" + static int -comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, capid_t pgtbl_cap, livenessid_t lid, - vaddr_t entry_addr, vaddr_t scb_uaddr) +comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, capid_t pgtbl_cap, capid_t scbcap, + livenessid_t lid, vaddr_t entry_addr, vaddr_t scb_uaddr) { - struct cap_comp * compc; - struct cap_pgtbl * ptc; + struct cap_comp *compc; + struct cap_pgtbl *ptc; struct cap_captbl *ctc; u32_t v, flags; int ret = 0; - vaddr_t scb_kaddr = 0; + struct cap_scb *scbc = NULL; ctc = (struct cap_captbl *)captbl_lkup(t, captbl_cap); if (unlikely(!ctc || ctc->h.type != CAP_CAPTBL || ctc->lvl > 0)) return -EINVAL; ptc = (struct cap_pgtbl *)captbl_lkup(t, pgtbl_cap); if (unlikely(!ptc || ptc->h.type != CAP_PGTBL || ptc->lvl > 0)) return -EINVAL; + if (likely(scbcap)) { + scbc = (struct cap_scb *)captbl_lkup(t, scbcap); + if (unlikely(!scbc || scbc->h.type != CAP_SCB)) return -EINVAL; + } v = ptc->refcnt_flags; if (v & CAP_MEM_FROZEN_FLAG) return -EINVAL; if (cos_cas((unsigned long *)&ptc->refcnt_flags, v, v + 1) != CAS_SUCCESS) return -ECASFAIL; - scb_kaddr = (vaddr_t)pgtbl_lkup(((struct cap_pgtbl *)ptc)->pgtbl, scb_uaddr, &flags); - assert(scb_kaddr); - v = ctc->refcnt_flags; if (v & CAP_MEM_FROZEN_FLAG) cos_throw(undo_ptc, -EINVAL); if (cos_cas((unsigned long *)&ctc->refcnt_flags, v, v + 1) != CAS_SUCCESS) { /* undo before return */ cos_throw(undo_ptc, -ECASFAIL); } - compc = (struct cap_comp *)__cap_capactivate_pre(t, cap, capin, CAP_COMP, &ret); if (!compc) cos_throw(undo_ctc, ret); + if (likely(scbc)) { + ret = scb_comp_update(t, scbc, compc, ptc, scb_uaddr); + if (ret) cos_throw(undo_capact, ret); + } compc->entry_addr = entry_addr; compc->info.pgtbl = ptc->pgtbl; compc->info.captbl = ctc->captbl; - compc->info.scb_data = (struct cos_scb_info *)scb_kaddr; - memset(compc->info.scb_data, 0, PAGE_SIZE); compc->pgd = ptc; compc->ct_top = ctc; ltbl_get(lid, &compc->info.liveness); @@ -73,6 +77,9 @@ comp_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t captbl_cap, return 0; +/*undo_scb: + scb_comp_remove(t, scbc, pgtbl_cap, scb_uaddr);*/ +undo_capact: undo_ctc: cos_faa((int *)&ctc->refcnt_flags, -1); undo_ptc: @@ -84,8 +91,8 @@ static int comp_deactivate(struct cap_captbl *ct, capid_t capin, livenessid_t lid) { int ret; - struct cap_comp * compc; - struct cap_pgtbl * pgd; + struct cap_comp *compc; + struct cap_pgtbl *pgd; struct cap_captbl *ct_top; compc = (struct cap_comp *)captbl_lkup(ct->captbl, capin); @@ -94,6 +101,8 @@ comp_deactivate(struct cap_captbl *ct, capid_t capin, livenessid_t lid) ltbl_expire(&compc->info.liveness); pgd = compc->pgd; ct_top = compc->ct_top; + /* TODO: right way to remove scb info */ + if (likely(compc->info.scb_data)) scb_comp_remove(ct, 0, 0, 0); ret = cap_capdeactivate(ct, capin, CAP_COMP, lid); if (ret) return ret; diff --git a/src/kernel/include/dcb.h 
b/src/kernel/include/dcb.h new file mode 100644 index 0000000000..3b64c4477e --- /dev/null +++ b/src/kernel/include/dcb.h @@ -0,0 +1,101 @@ +/** + * Copyright 2019 by Phani Gadepalli, phanikishoreg@gwu.edu + * + * Redistribution of this file is permitted under the GNU General Public License v2. + */ + +#ifndef DCB_H +#define DCB_H + +#include "cap_ops.h" +#include "pgtbl.h" +#include "retype_tbl.h" +#include "component.h" + +#define DCB_ENTRIES_MAX_PER_PAGE (PAGE_SIZE/sizeof(struct cos_dcb_info)) + +struct cap_dcb { + struct cap_header h; + struct liveness_data liveness; + unsigned int refcnt; + vaddr_t kern_addr; + cpuid_t cpuid; +} __attribute__((packed)); + +static int +dcb_activate(struct captbl *t, capid_t ctcap, capid_t dcbcap, capid_t ptcap, vaddr_t kaddr, livenessid_t lid, capid_t ptcapin, vaddr_t uaddr) +{ + struct cap_dcb *dc; + struct cap_pgtbl *ptc; + unsigned long *tpte; + struct cos_dcb_info *di; + int ret; + + ret = cap_kmem_activate(t, ptcap, kaddr, (unsigned long *)&di, &tpte); + if (unlikely(ret)) return -EINVAL; + assert(di && tpte); + + /* TODO: memactivate kaddr -> uaddr in ptcapin */ + + dc = (struct cap_dcb *)__cap_capactivate_pre(t, ctcap, dcbcap, CAP_DCB, &ret); + if (!dc) return -EINVAL; + + ltbl_get(lid, &dc->liveness); + dc->kern_addr = (vaddr_t)di; + dc->refcnt = 0; + dc->cpuid = get_cpuid(); + + __cap_capactivate_post(&dc->h, CAP_DCB); + + return 0; +} + +static int +dcb_deactivate(struct cap_captbl *ct, capid_t dcbcap, livenessid_t lid, capid_t ptcap, capid_t cosframe_addr, capid_t ptcapin, vaddr_t uaddrin) +{ + struct cap_dcb *dc; + int ret; + + dc = (struct cap_comp *)captbl_lkup(ct->captbl, dcbcap); + if (dc->h.type != CAP_DCB) return -EINVAL; + + if (dc->refcnt) return -EPERM; + /* TODO: verify uaddrin in ptcapin maps to kaddr for this dcb and then unmap from ptcapin at uaddrin */ + + ltbl_expire(&dc->liveness); + ret = kmem_deact_pre(dc, ct, ptcap, cosframe_addr, &pte, &old_v); + if (ret) return ret; + ret = kmem_deact_post(pte, old_v); + if (ret) return ret; + dc->kern_addr = 0; + + return cap_capdeactivate(ct, dcbcap, CAP_DCB, lid); +} + +static inline int +dcb_thd_ref(struct cap_dcb *dc, struct thread *thd) +{ + if (dc->refcnt >= DCB_ENTRIES_MAX_PER_PAGE) return -EINVAL; + if (dc->cpuid != thd->cpuid) return -EINVAL; + if (!ltbl_isalive(&dc->liveness)) return -EPERM; + + dc->refcnt++; + + return 0; +} + +static inline int +dcb_thd_deref(struct cap_dcb *dc, struct thread *thd) +{ + if (!dc->refcnt) return -EINVAL; + if (dc->cpuid != thd->cpuid) return -EINVAL; + if (!ltbl_isalive(&dc->liveness)) return -EPERM; + + assert((vaddr_t)thd->dcbinfo >= dc->kern_addr && (vaddr_t)thd->dcbinfo < (dc->kern_addr + PAGE_SIZE)); + + dc->refcnt--; + + return 0; +} + +#endif /* DCB_H */ diff --git a/src/kernel/include/scb.h b/src/kernel/include/scb.h new file mode 100644 index 0000000000..b3618bed12 --- /dev/null +++ b/src/kernel/include/scb.h @@ -0,0 +1,100 @@ +/** + * Copyright 2019 by Phani Gadepalli, phanikishoreg@gwu.edu + * + * Redistribution of this file is permitted under the GNU General Public License v2. 
+ */ + +#ifndef SCB_H +#define SCB_H + +#include "component.h" +#include "cap_ops.h" +#include "pgtbl.h" +#include "retype_tbl.h" + +struct comp_info; + +struct cap_scb { + struct cap_header h; + struct liveness_data liveness; + struct cap_comp *compc; + vaddr_t kern_addr; +} __attribute__((packed)); + +static int +scb_activate(struct captbl *t, capid_t ctcap, capid_t scbcap, vaddr_t kaddr, livenessid_t lid) +{ + struct cap_scb *sc; + int ret; + + sc = (struct cap_scb *)__cap_capactivate_pre(t, ctcap, scbcap, CAP_SCB, &ret); + if (!sc) return -EINVAL; + + ltbl_get(lid, &sc->liveness); + sc->kern_addr = kaddr; + sc->compc = NULL; + memset((void *)kaddr, 0, COS_SCB_SIZE); + + __cap_capactivate_post(&sc->h, CAP_SCB); + + return 0; +} + +static int +scb_deactivate(struct cap_captbl *ct, capid_t scbcap, capid_t ptcap, capid_t cosframe_addr, livenessid_t lid) +{ + struct cap_scb *sc; + unsigned long old_v = 0, *pte = NULL; + int ret; + + sc = (struct cap_scb *)captbl_lkup(ct->captbl, scbcap); + if (sc->h.type != CAP_SCB) return -EINVAL; + + /* FIXME: component using this scbcap is still active! how to handle this? */ + if (sc->compc) return -EPERM; + + ltbl_expire(&sc->liveness); + ret = kmem_deact_pre((struct cap_header *)sc, ct->captbl, ptcap, cosframe_addr, &pte, &old_v); + if (ret) return ret; + ret = kmem_deact_post(pte, old_v); + if (ret) return ret; + + return cap_capdeactivate(ct, scbcap, CAP_SCB, lid); +} + +static inline int +scb_comp_update(struct captbl *ct, struct cap_scb *sc, struct cap_comp *compc, struct cap_pgtbl *ptcin, vaddr_t uaddrin) +{ + paddr_t pf = chal_va2pa((void *)(sc->kern_addr)); + + if (unlikely(!ltbl_isalive(&sc->liveness))) return -EPERM; + if (pgtbl_mapping_add(ptcin->pgtbl, uaddrin, pf, PGTBL_USER_DEF)) return -EINVAL; + + sc->compc = compc; + compc->info.scb_data = (struct cos_scb_info *)(sc->kern_addr); + + return 0; +} + +static inline int +scb_comp_remove(struct cap_captbl *ct, struct cap_scb *sc, capid_t ptcapin, vaddr_t uaddrin) +{ + int ret; + + if (unlikely(!ct || !sc || !ptcapin || !uaddrin)) return -EINVAL; + + if (unlikely(!ltbl_isalive(&sc->liveness))) return -EPERM; + if (unlikely(!sc->compc)) return -EINVAL; + + /* TODO: unmap uaddrin in the user-land */ + + return 0; +} + +static inline struct liveness_data * +scb_liveness(struct cap_scb *sc) +{ + return &sc->liveness; +} + +#endif /* SCB_H */ diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index 3e416ab8fc..ac86706d2a 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -126,6 +126,12 @@ typedef enum { CAPTBL_OP_HW_MAP, CAPTBL_OP_HW_CYC_USEC, CAPTBL_OP_HW_CYC_THRESH, + + CAPTBL_OP_SCB_ACTIVATE, + CAPTBL_OP_SCB_DEACTIVATE, + + CAPTBL_OP_DCB_ACTIVATE, + CAPTBL_OP_DCB_DEACTIVATE, } syscall_op_t; typedef enum { @@ -143,8 +149,13 @@ typedef enum { CAP_QUIESCENCE, /* when deactivating, set to track quiescence state */ CAP_TCAP, /* tcap captable entry */ CAP_HW, /* hardware (interrupt) */ + CAP_SCB, /* Scheduler control block (SCB) */ + CAP_DCB, /* Dispatch control block (DCB) */ } cap_t; +/* maximum size allowed for CAP TYPE in a capability header */ +#define CAP_TYPE_MAXBITS 7 +#define CAP_TYPE_MAX (1 << CAP_TYPE_MAXBITS - 1) /* TODO: pervasive use of these macros */ /* v \in struct cap_* *, type \in cap_t */ #define CAP_TYPECHK(v, t) ((v) && (v)->h.type == (t)) @@ -193,12 +204,16 @@ typedef int cpuid_t; static inline cap_sz_t __captbl_cap2sz(cap_t c) { + /* if (unlikely(c > CAP_TYPE_MAX)) return CAP_SZ_ERR; */ + 
/* TODO: optimize for invocation and return */ switch (c) { case CAP_SRET: case CAP_THD: case CAP_TCAP: return CAP_SZ_16B; + case CAP_SCB: + case CAP_DCB: case CAP_CAPTBL: case CAP_PGTBL: case CAP_HW: /* TODO: 256bits = 32B * 8b */ @@ -252,11 +267,13 @@ enum BOOT_CAPTBL_KM_PTE = 18, BOOT_CAPTBL_SINV_CAP = 20, - BOOT_CAPTBL_SELF_INITHW_BASE = 24, + BOOT_CAPTBL_SELF_SCB = 24, /* FIXME: Do we need this? */ + BOOT_CAPTBL_SELF_INITHW_BASE = 26, BOOT_CAPTBL_SELF_INITTHD_BASE = 28, /* * NOTE: kernel doesn't support sharing a cache-line across cores, * so optimize to place INIT THD/TCAP on same cache line and bump by 64B for next CPU + * Update: add per-core INIT DCB cap in to the same cache-line. */ BOOT_CAPTBL_SELF_INITRCV_BASE = round_up_to_pow2(BOOT_CAPTBL_SELF_INITTHD_BASE + NUM_CPU * CAP64B_IDSZ, CAPMAX_ENTRY_SZ), @@ -266,13 +283,17 @@ enum }; #define BOOT_CAPTBL_SELF_INITTCAP_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE + CAP16B_IDSZ) +#define BOOT_CAPTBL_SELF_INITDCB_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE + CAP32B_IDSZ) + #define BOOT_CAPTBL_SELF_INITTHD_CPU_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE (BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITRCV_CPU_BASE (BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cos_cpuid())) +#define BOOT_CAPTBL_SELF_INITDCB_CPU_BASE (BOOT_CAPTBL_SELF_INITDCB_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE + cpuid * CAP64B_IDSZ) #define BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) + CAP16B_IDSZ) #define BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITRCV_BASE + cpuid * CAP64B_IDSZ) +#define BOOT_CAPTBL_SELF_INITDCB_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) + CAP32B_IDSZ) /* * The half of the first page of init captbl is devoted to root node. So, the @@ -433,7 +454,6 @@ struct cos_component_information { vaddr_t cos_heap_allocated, cos_heap_alloc_extent; vaddr_t cos_upcall_entry; vaddr_t cos_async_inv_entry; - struct cos_scb_info *cos_scb_data; vaddr_t cos_user_caps; struct restartable_atomic_sequence cos_ras[COS_NUM_ATOMIC_SECTIONS / 2]; vaddr_t cos_poly[COMP_INFO_POLY_NUM]; diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 17f012c543..cb269e3f7c 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -370,6 +370,7 @@ thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, c tc->t = thd; tc->cpuid = get_cpuid(); __cap_capactivate_post(&tc->h, CAP_THD); + /* TODO: dcb_thd_ref() */ return 0; } @@ -426,6 +427,7 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi ret = kmem_deact_post(pte, old_v); if (ret) cos_throw(err, ret); } + /* TODO: dcb_thd_deref() */ return 0; err: diff --git a/src/platform/i386/boot_comp.c b/src/platform/i386/boot_comp.c index 4c05e2a006..9599051ff5 100644 --- a/src/platform/i386/boot_comp.c +++ b/src/platform/i386/boot_comp.c @@ -95,11 +95,6 @@ boot_pgtbl_mappings_add(struct captbl *ct, capid_t pgdcap, capid_t ptecap, const assert(i == range / PAGE_SIZE); assert(COS_SCB_SIZE == PAGE_SIZE); /* FIXME: for prototype impl! 
*/ - p = mem_boot_alloc(1); - assert(p); - pf = chal_va2pa(p); - - if (pgtbl_mapping_add(pgtbl, mapat, pf, PGTBL_USER_DEF)) assert(0); *scb_uaddr = (unsigned long)mapat; i++; @@ -183,7 +178,7 @@ kern_boot_comp(const cpuid_t cpu_id) u8_t * boot_comp_captbl; pgtbl_t pgtbl = (pgtbl_t)chal_va2pa(&boot_comp_pgd), boot_vm_pgd; u32_t hw_bitmap = 0xFFFFFFFF; - vaddr_t scb_uaddr = 0; + vaddr_t scb_uaddr = 0, scb_kaddr = 0; assert(cpu_id >= 0); if (NUM_CPU > 1 && cpu_id > 0) { @@ -222,6 +217,8 @@ kern_boot_comp(const cpuid_t cpu_id) hw_asndcap_init(); if (hw_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITHW_BASE, hw_bitmap)) assert(0); + scb_kaddr = (vaddr_t)mem_boot_alloc(1); + assert(scb_kaddr); /* * separate pgd for boot component virtual memory */ @@ -250,14 +247,15 @@ kern_boot_comp(const cpuid_t cpu_id) mem_utmem_end() - mem_boot_nalloc_end(nkmemptes), 0, 0); assert(ret == 0); - printk("\tCapability table and page-table created.\n"); - /* Shut off further bump allocations */ glb_memlayout.allocs_avail = 0; + if (scb_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_SCB, scb_kaddr, 0)) assert(0); + printk("\tCapability table and page-table created.\n"); - if (comp_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_PT, 0, - (vaddr_t)mem_bootc_entry(), scb_uaddr)) + if (comp_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_PT, + BOOT_CAPTBL_SELF_SCB, 0, (vaddr_t)mem_bootc_entry(), scb_uaddr)) assert(0); + printk("\tCreated boot component structure from page-table and capability-table.\n"); kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], dcb_addr[cpu_id], cpu_id); From fb5a7e9348ef58f44ce2cf1bbcc96d511cf2801c Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 11 Mar 2019 19:33:08 -0400 Subject: [PATCH 032/127] SCB and DCB capabilities * Got SCB and DCB capabilities, api all plumbing working!! * Some issues to debug, like asserts in my test for hierarchical scheduling. (without user-level dispatch!) * TODO: with user-level dispatch, test in an "application" component using capmgr for dcb setup etc! 
--- src/components/Makefile.comp | 4 +- .../implementation/capmgr/naive/cap_info.c | 28 +++- .../implementation/capmgr/naive/cap_info.h | 11 +- .../implementation/capmgr/naive/cap_mgr.c | 101 +++++++++----- .../implementation/capmgr/naive/init.c | 90 ++++++++++--- .../implementation/capmgr/naive/mem_mgr.c | 30 ----- .../no_interface/llbooter/boot_deps.h | 123 +++++++++++------- .../no_interface/vkernel/micro_booter.h | 3 - .../no_interface/vkernel/vk_api.c | 8 +- .../no_interface/vkernel/vk_types.h | 2 +- .../no_interface/vkernel/vkernel.c | 2 +- .../no_interface/vkernel/vm_booter.c | 2 +- .../implementation/sched/hier_fprr/init.c | 31 ++++- .../implementation/sched/root_fprr/init.c | 31 ++++- .../implementation/sched/root_fprr_raw/init.c | 25 +++- src/components/implementation/sched/sched.c | 24 ++-- .../implementation/sched/sched_info.c | 7 - .../implementation/sched/sched_info.h | 2 + .../tests/micro_booter/mb_tests.c | 16 +-- .../tests/micro_booter/micro_booter.c | 49 +------ .../tests/micro_booter/micro_booter.h | 5 +- .../tests/unit_capmgr/unit_capmgr.c | 4 +- .../tests/unit_defcompinfo/unit_defcompinfo.c | 2 +- .../tests/unit_schedcomp/unit_schedcomp.c | 1 + .../tests/unit_schedtests/unit_schedlib.c | 4 +- src/components/include/cos_component.h | 4 +- src/components/include/cos_dcb.h | 21 ++- src/components/include/cos_defkernel_api.h | 19 +-- src/components/include/cos_kernel_api.h | 28 ++-- src/components/include/hypercall.h | 21 ++- src/components/include/sl.h | 8 +- src/components/interface/capmgr/capmgr.h | 8 +- src/components/interface/capmgr/memmgr.h | 3 - .../interface/capmgr/stubs/c_stub.c | 48 ++++--- .../interface/capmgr/stubs/s_stub.S | 3 +- src/components/lib/Makefile | 2 +- src/components/lib/cos_dcb.c | 92 +++++++++++++ src/components/lib/cos_dcbcapmgr.c | 45 ------- src/components/lib/cos_dcbraw.c | 47 ------- src/components/lib/cos_defkernel_api.c | 68 ++++++---- src/components/lib/cos_kernel_api.c | 56 +++++--- src/components/lib/sl/sl_capmgr.c | 36 +++-- src/components/lib/sl/sl_raw.c | 64 ++++++--- src/components/lib/sl/sl_sched.c | 8 +- src/kernel/capinv.c | 104 +++++++-------- src/kernel/include/dcb.h | 38 +++--- src/kernel/include/scb.h | 2 +- src/kernel/include/shared/cos_types.h | 14 +- src/kernel/include/thd.h | 44 +++++-- src/platform/i386/boot_comp.c | 91 ++++++------- 50 files changed, 859 insertions(+), 620 deletions(-) create mode 100644 src/components/lib/cos_dcb.c delete mode 100644 src/components/lib/cos_dcbcapmgr.c delete mode 100644 src/components/lib/cos_dcbraw.c diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 80251128cc..9f4f96dc8c 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -53,5 +53,5 @@ CLIENT_STUB=c_stub.o LIBCOSDEFKERN=-lcos_kernel_api -lcos_defkernel_api LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcpu -lsl_child -lck -LIBSLCAPMGR=$(LIBSLCORE) -lsl_capmgr -lcos_dcbcapmgr -LIBSLRAW=$(LIBSLCORE) -lsl_raw -lcos_dcbraw +LIBSLCAPMGR=$(LIBSLCORE) -lsl_capmgr +LIBSLRAW=$(LIBSLCORE) -lsl_raw -lcos_dcb diff --git a/src/components/implementation/capmgr/naive/cap_info.c b/src/components/implementation/capmgr/naive/cap_info.c index de9c015100..c82f82450d 100644 --- a/src/components/implementation/capmgr/naive/cap_info.c +++ b/src/components/implementation/capmgr/naive/cap_info.c @@ -54,6 +54,27 @@ cap_info_thd_next(struct cap_comp_info *rci) return NULL; } +void +cap_info_cpu_initdcb_init(struct cap_comp_info *rci) +{ + dcbcap_t initdcb = 0; + unsigned short init_off = 0; + 
vaddr_t initaddr = 0; + struct cos_compinfo *ci = cos_compinfo_get(cap_info_dci(rci)); + struct cap_comp_cpu_info *rci_cpu = cap_info_cpu_local(rci); + + if (rci->cid == 0 || rci->cid == cos_spd_id()) { + cos_dcb_info_init_ext(cap_info_cpu_dcbdata(rci_cpu), 0, 0, 0, 0); + return; + } + + initaddr = rci->init_dcb_start + cos_cpuid() * PAGE_SIZE; + initdcb = cos_dcb_alloc(cos_compinfo_get(cos_defcompinfo_curr_get()), ci->pgtbl_cap, initaddr); + assert(initdcb); + + cos_dcb_info_init_ext(cap_info_cpu_dcbdata(rci_cpu), ci, initdcb, initaddr, init_off); +} + struct cap_comp_info * cap_info_comp_init(spdid_t spdid, captblcap_t captbl_cap, pgtblcap_t pgtbl_cap, compcap_t compcap, capid_t cap_frontier, vaddr_t heap_frontier, spdid_t sched_spdid) @@ -68,13 +89,16 @@ cap_info_comp_init(spdid_t spdid, captblcap_t captbl_cap, pgtblcap_t pgtbl_cap, capci[spdid].cid = spdid; cos_meminfo_init(&ci->mi, 0, 0, 0); - cos_compinfo_init(ci, pgtbl_cap, captbl_cap, compcap, heap_frontier, cap_frontier, - 0, 0, cos_compinfo_get(cos_defcompinfo_curr_get())); + cos_compinfo_init(ci, pgtbl_cap, captbl_cap, compcap, heap_frontier, + cap_frontier, cos_compinfo_get(cos_defcompinfo_curr_get())); memset(rglb, 0, sizeof(struct cap_shmem_glb_info)); memset(cap_shi, 0, sizeof(struct cap_shmem_info)); cap_shi->cinfo = ci; + capci[spdid].init_dcb_start = heap_frontier - (NUM_CPU * PAGE_SIZE); + cap_info_cpu_initdcb_init(&capci[spdid]); + capci[spdid].initflag = 1; ps_faa((unsigned long *)&cap_comp_count, 1); diff --git a/src/components/implementation/capmgr/naive/cap_info.h b/src/components/implementation/capmgr/naive/cap_info.h index fc9a85dc03..2bafc7c08a 100644 --- a/src/components/implementation/capmgr/naive/cap_info.h +++ b/src/components/implementation/capmgr/naive/cap_info.h @@ -8,6 +8,7 @@ #include #include #include +#include #define CAP_INFO_MAX_THREADS (MAX_NUM_THREADS) @@ -44,7 +45,7 @@ struct cap_comp_cpu_info { thdcap_t p_initthdcap; /* init thread's cap in parent */ thdid_t initthdid; /* init thread's tid */ - vaddr_t initdcbpg; + struct cos_dcbinfo_data dcb_data; } CACHE_ALIGNED; struct cap_comp_info { @@ -52,6 +53,7 @@ struct cap_comp_info { struct cos_defcompinfo defci; struct cap_shmem_info shminfo; int initflag; + vaddr_t init_dcb_start; struct cap_comp_cpu_info cpu_local[NUM_CPU]; }; @@ -61,6 +63,7 @@ struct cap_comp_info *cap_info_comp_init(spdid_t spdid, captblcap_t captbl_cap, struct sl_thd *cap_info_thd_init(struct cap_comp_info *rci, struct sl_thd *t, cos_channelkey_t key); struct sl_thd *cap_info_initthd_init(struct cap_comp_info *rci, struct sl_thd *t, cos_channelkey_t key); +void cap_info_cpu_initdcb_init(struct cap_comp_info *rci); struct cap_comp_info *cap_info_comp_find(spdid_t s); struct sl_thd *cap_info_thd_find(struct cap_comp_info *r, thdid_t t); @@ -98,6 +101,12 @@ cap_info_cpu_local(struct cap_comp_info *c) return &c->cpu_local[cos_cpuid()]; } +static inline struct cos_dcbinfo_data * +cap_info_cpu_dcbdata(struct cap_comp_cpu_info *c) +{ + return &c->dcb_data; +} + static inline struct cap_comp_info * cap_info_parent(struct cap_comp_info *r) { diff --git a/src/components/implementation/capmgr/naive/cap_mgr.c b/src/components/implementation/capmgr/naive/cap_mgr.c index 3b4f5c6618..5b8a35ba9c 100644 --- a/src/components/implementation/capmgr/naive/cap_mgr.c +++ b/src/components/implementation/capmgr/naive/cap_mgr.c @@ -6,7 +6,7 @@ #include thdcap_t -capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx) +capmgr_thd_create_cserialized(struct cos_dcb_info **dcb, 
thdid_t *tid, thdclosure_index_t idx) { spdid_t cur = cos_inv_token(); struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); @@ -14,13 +14,18 @@ capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx) struct cap_comp_info *r = cap_info_comp_find(cur); struct sl_thd *rt = NULL, *t = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; if (!r || !cap_info_init_check(r)) return 0; if (!cap_info_is_sched(cur)) return 0; if (idx <= 0) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(r), NULL, idx, 0, 0, 0, 0, NULL); - if (!t) return 0; + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(r)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! */ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(r), NULL, idx, 0, 0, 0, dcbcap, dcboff, NULL); + if (!t) return 0; thdcap = cos_cap_cpy(cap_info_ci(r), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!thdcap) goto err; @@ -35,7 +40,7 @@ capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx) } thdcap_t -capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosure_index_t idx) +capmgr_thd_create_ext_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, spdid_t s, thdclosure_index_t idx) { spdid_t cur = cos_inv_token(); struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); @@ -44,6 +49,9 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu struct cap_comp_info *rs = cap_info_comp_find(s); struct sl_thd *t = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; if (!rc || !cap_info_init_check(rc)) return 0; if (!rs || !cap_info_init_check(rs)) return 0; @@ -51,7 +59,9 @@ capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosu if (cap_info_is_sched(s)) return 0; if (idx <= 0) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rs), NULL, idx, 0, 0, 0, 0, NULL); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rs)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! */ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rs), NULL, idx, 0, 0, 0, dcbcap, dcboff, NULL); if (!t) return 0; thdcap = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!thdcap) goto err; @@ -80,16 +90,18 @@ capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s) struct cos_compinfo *rs_ci = cap_info_ci(rs); struct sl_thd *t = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; if (!rc || !cap_info_init_check(rc)) return 0; if (!rs || !cap_info_init_check(rs)) return 0; if (!cap_info_is_sched(cur) || !cap_info_is_child(rc, s)) return 0; if (cap_info_is_sched(s)) return 0; - assert(rs_cpu->initdcbpg == 0); - rs_cpu->initdcbpg = (vaddr_t)cos_dcbpg_bump_allocn(rs_ci, PAGE_SIZE); - assert(rs_cpu->initdcbpg); - t = sl_thd_initaep_alloc(cap_info_dci(rs), NULL, 0, 0, 0, rs_cpu->initdcbpg); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(rs_cpu), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ + t = sl_thd_initaep_alloc_dcb(cap_info_dci(rs), NULL, 0, 0, 0, dcbcap); if (!t) return 0; /* child is not a scheduler, don't copy into child */ /* parent only needs the thdcap */ @@ -120,6 +132,9 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, spdid_t s, struct cos_compinfo *rs_ci = cap_info_ci(rs); struct sl_thd *t = NULL, *rinit = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; int ret; tcap_t tc; arcvcap_t rcv; @@ -133,10 +148,9 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, spdid_t s, rinit = cap_info_initthd(rc); if (!rinit) return 0; - assert(rs_cpu->initdcbpg == 0); - rs_cpu->initdcbpg = (vaddr_t)cos_dcbpg_bump_allocn(rs_ci, PAGE_SIZE); - assert(rs_cpu->initdcbpg); - t = sl_thd_initaep_alloc(cap_info_dci(rs), rinit, 1, owntc, 0, rs_cpu->initdcbpg); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(rs_cpu), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || dcboff) return 0; /* dcboff == 0 for initthd in that comp! */ + t = sl_thd_initaep_alloc_dcb(cap_info_dci(rs), rinit, 1, owntc, 0, dcbcap); if (!t) return 0; /* child is a scheduler.. copy initcaps */ ret = cos_cap_cpy_at(rs_ci, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, cap_ci, sl_thd_thdcap(t)); @@ -176,8 +190,33 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, spdid_t s, return 0; } -thdcap_t -capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, spdid_t s, thdclosure_index_t tidx, u32_t owntc_chkey) +arcvcap_t +capmgr_aep_rcv_retrieve_cserialized(spdid_t s, thdid_t tid) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct cap_comp_info *rs = cap_info_comp_find(s); + struct sl_thd *ti = cap_info_thd_find(rs, tid); + arcvcap_t dstrcv = 0; + + if (!rc || !cap_info_init_check(rc)) return 0; + if (!rs || !cap_info_init_check(rs)) return 0; + if (!cap_info_is_sched(cur) || !cap_info_is_child(rc, s)) return 0; + if (!ti || !sl_thd_thdcap(ti)) return 0; + + /* + * for aep thread.. rcv cap should be accessible in the destination component, + * so we return that cap so the scheduler can init proper structure of the dest component. + */ + dstrcv = cos_cap_cpy(cap_info_ci(rs), cap_ci, CAP_ARCV, sl_thd_rcvcap(ti)); + + return dstrcv; +} + +u32_t +capmgr_aep_create_ext_cserialized(struct cos_dcb_info **dcb, u32_t *rcvtcret, spdid_t s, thdclosure_index_t tidx, u32_t owntc_chkey) { spdid_t cur = cos_inv_token(); struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); @@ -186,9 +225,12 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, spdid_t s, struct cap_comp_info *rs = cap_info_comp_find(s); struct sl_thd *t = NULL, *rinit = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; int owntc = (int)(owntc_chkey >> 16); cos_channelkey_t key = (cos_channelkey_t)((owntc_chkey << 16) >> 16); - arcvcap_t srcrcv, dstrcv; + arcvcap_t srcrcv; tcap_t tc; int ret; @@ -200,17 +242,13 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, spdid_t s, rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rs), rinit, tidx, 1, owntc, 0, 0, &srcrcv); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rs)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rs), rinit, tidx, 1, owntc, 0, dcbcap, dcboff, &srcrcv); if (!t) return 0; /* cur is a scheduler, copy thdcap */ ret = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!ret) goto err; - /* - * for aep thread.. rcv cap should be accessible in the destination component, - * so we return that cap so the scheduler can init proper structucap of the dest component. - */ - dstrcv = cos_cap_cpy(cap_info_ci(rs), cap_ci, CAP_ARCV, sl_thd_rcvcap(t)); - if (!dstrcv) goto err; if (owntc) { /* @@ -232,8 +270,7 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, spdid_t s, cap_info_thd_init(rc, t, key); cap_info_thd_init(rs, t, 0); - *drcvtidret = (dstrcv << 16 | sl_thd_thdid(t)); - thdcap = ret; + thdcap = ret << 16 | sl_thd_thdid(t); return thdcap; err: @@ -242,8 +279,8 @@ capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, spdid_t s, return 0; } -thdcap_t -capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, thdclosure_index_t tidx, int owntc, cos_channelkey_t key) +u32_t +capmgr_aep_create_cserialized(struct cos_dcb_info **dcb, u32_t *tcrcvret, thdclosure_index_t tidx, int owntc, cos_channelkey_t key) { spdid_t cur = cos_inv_token(); struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); @@ -251,6 +288,9 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, thdclosure_index_t struct cap_comp_info *rc = cap_info_comp_find(cur); struct sl_thd *t = NULL, *rinit = NULL; thdcap_t thdcap = 0; + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; + vaddr_t dcbaddr = 0; arcvcap_t rcv; tcap_t tc; int ret; @@ -262,7 +302,9 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, thdclosure_index_t rinit = cap_info_initthd(rc); if (!rinit) return 0; - t = sl_thd_aep_alloc_ext(cap_info_dci(rc), rinit, tidx, 1, owntc, 0, 0, &rcv); + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rc)), &dcboff, &dcbaddr); + if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! */ + t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rc), rinit, tidx, 1, owntc, 0, dcbcap, dcboff, &rcv); if (!t) return 0; /* current is a sched, so copy */ ret = cos_cap_cpy(cap_info_ci(rc), cap_ci, CAP_THD, sl_thd_thdcap(t)); @@ -281,8 +323,7 @@ capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, thdclosure_index_t cap_info_thd_init(rc, t, key); *tcrcvret = (tc << 16 | rcv); - *tid = sl_thd_thdid(t); - thdcap = ret; + thdcap = ret << 16 | sl_thd_thdid(t); return thdcap; err: diff --git a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index 1849eada9b..2817b1b14c 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -12,8 +12,8 @@ static int capmgr_init_core_done = 0; static void capmgr_comp_info_iter_cpu(void) { - struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); - struct cos_compinfo * ci = cos_compinfo_get(defci); + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); struct cap_comp_info *btinfo = cap_info_comp_find(0); int remaining = hypercall_numcomps_get(), i; int num_comps = 0; @@ -41,14 +41,17 @@ capmgr_comp_info_iter_cpu(void) if (spdid == 0 || (spdid != cos_spd_id() && cap_info_is_child(btinfo, spdid))) { is_sched = (spdid == 0 || cap_info_is_sched_child(btinfo, spdid)) ? 
1 : 0; - ret = hypercall_comp_initaep_get(spdid, is_sched, &aep, &sched_spdid); - assert(ret == 0); + if (!spdid || (spdid && sched_spdid != 0)) { + ret = hypercall_comp_initaep_get(spdid, is_sched, &aep, &sched_spdid); + assert(ret == 0); + } } rci_sched = cap_info_comp_find(sched_spdid); assert(rci_sched && cap_info_init_check(rci_sched)); rci_cpu->parent = rci_sched; rci_cpu->thd_used = 1; + cap_info_cpu_initdcb_init(rci); while ((remain_child = hypercall_comp_child_next(spdid, &childid, &ch_flags)) >= 0) { bitmap_set(rci_cpu->child_bitmap, childid - 1); @@ -60,12 +63,6 @@ capmgr_comp_info_iter_cpu(void) if (!remain_child) break; } - if (sched_spdid == 0) { - initdcbpg = hypercall_initdcb_get(spdid); - assert(initdcbpg); - rci_cpu->initdcbpg = initdcbpg; - } - if (aep.thd) { ithd = sl_thd_init_ext(&aep, NULL); assert(ithd); @@ -73,7 +70,34 @@ capmgr_comp_info_iter_cpu(void) cap_info_initthd_init(rci, ithd, 0); } else if (cos_spd_id() == spdid) { cap_info_initthd_init(rci, sl__globals_cpu()->sched_thd, 0); + } else if (!sched_spdid && spdid) { + struct sl_thd *booter_thd = cap_info_initthd(btinfo); + dcbcap_t dcap; + dcboff_t off = 0; + vaddr_t addr = 0; + struct cos_compinfo *rt_ci = cap_info_ci(rci); + + dcap = cos_dcb_info_alloc(&rci_cpu->dcb_data, &off, &addr); + if (dcap) assert(off == 0 && addr); + + /* root-scheduler */ + ithd = sl_thd_initaep_alloc_dcb(cap_info_dci(rci), booter_thd, is_sched, is_sched ? 1 : 0, 0, dcap); + assert(ithd); + + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, ci, sl_thd_thdcap(ithd)); + assert(ret == 0); + if (is_sched) { + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, ci, sl_thd_tcap(ithd)); + assert(ret == 0); + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, ci, sl_thd_rcvcap(ithd)); + assert(ret == 0); + } + + ret = hypercall_root_initaep_set(spdid, sl_thd_aepinfo(ithd)); + assert(ret == 0); + cap_info_initthd_init(rci, ithd, 0); } + } while (remaining > 0); for (i = 0; i < (int)MAX_NUM_COMP_WORDS; i++) PRINTLOG(PRINT_DEBUG, "Scheduler bitmap[%d]: %u\n", i, cap_info_schedbmp[cos_cpuid()][i]); @@ -85,8 +109,8 @@ capmgr_comp_info_iter_cpu(void) static void capmgr_comp_info_iter(void) { - struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); - struct cos_compinfo * ci = cos_compinfo_get(defci); + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); struct cap_comp_info *btinfo = cap_info_comp_find(0); int remaining = 0, i; int num_comps = 0; @@ -123,8 +147,10 @@ capmgr_comp_info_iter(void) is_sched = (spdid == 0 || cap_info_is_sched_child(btinfo, spdid)) ? 
1 : 0; - ret = hypercall_comp_initaep_get(spdid, is_sched, &aep, &ss); - assert(ret == 0 && ss == sched_spdid); + if (!spdid || (spdid && sched_spdid != 0)) { + ret = hypercall_comp_initaep_get(spdid, is_sched, &aep, &ss); + assert(ret == 0 && ss == sched_spdid); + } } ret = hypercall_comp_frontier_get(spdid, &vasfr, &capfr); @@ -144,12 +170,6 @@ capmgr_comp_info_iter(void) if (!remain_child) break; } - if (sched_spdid == 0) { - initdcbpg = hypercall_initdcb_get(spdid); - assert(initdcbpg); - rci_cpu->initdcbpg = initdcbpg; - } - if (aep.thd) { ithd = sl_thd_init_ext(&aep, NULL); assert(ithd); @@ -157,6 +177,32 @@ capmgr_comp_info_iter(void) cap_info_initthd_init(rci, ithd, 0); } else if (cos_spd_id() == spdid) { cap_info_initthd_init(rci, sl__globals_cpu()->sched_thd, 0); + } else if (!sched_spdid && spdid) { + struct sl_thd *booter_thd = cap_info_initthd(btinfo); + dcbcap_t dcap; + dcboff_t off = 0; + vaddr_t addr = 0; + struct cos_compinfo *rt_ci = cap_info_ci(rci); + + dcap = cos_dcb_info_alloc(&rci_cpu->dcb_data, &off, &addr); + if (dcap) assert(off == 0 && addr); + + /* root-scheduler */ + ithd = sl_thd_initaep_alloc_dcb(cap_info_dci(rci), booter_thd, is_sched, is_sched ? 1 : 0, 0, dcap); + assert(ithd); + + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, ci, sl_thd_thdcap(ithd)); + assert(ret == 0); + if (is_sched) { + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, ci, sl_thd_tcap(ithd)); + assert(ret == 0); + ret = cos_cap_cpy_at(rt_ci, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, ci, sl_thd_rcvcap(ithd)); + assert(ret == 0); + } + + ret = hypercall_root_initaep_set(spdid, sl_thd_aepinfo(ithd)); + assert(ret == 0); + cap_info_initthd_init(rci, ithd, 0); } } while (remaining > 0); @@ -184,14 +230,16 @@ cos_init(void) cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init_ext(BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, - BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_SCB, heap_frontier - COS_SCB_SIZE, heap_frontier, cap_frontier); + BOOT_CAPTBL_SELF_COMP, heap_frontier, cap_frontier); cap_info_init(); + cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); capmgr_comp_info_iter(); } else { while (!capmgr_init_core_done) ; /* WAIT FOR INIT CORE TO BE DONE */ cos_defcompinfo_sched_init(); + cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); capmgr_comp_info_iter_cpu(); } diff --git a/src/components/implementation/capmgr/naive/mem_mgr.c b/src/components/implementation/capmgr/naive/mem_mgr.c index a0e58ee71c..bb1d64936e 100644 --- a/src/components/implementation/capmgr/naive/mem_mgr.c +++ b/src/components/implementation/capmgr/naive/mem_mgr.c @@ -24,36 +24,6 @@ memmgr_heap_page_allocn(unsigned long npages) return dst_pg; } -vaddr_t -memmgr_dcbpage_allocn(unsigned long npages) -{ - spdid_t cur = cos_inv_token(); - struct cos_compinfo *cap_ci = cos_compinfo_get(cos_defcompinfo_curr_get()); - struct cap_comp_info *cur_rci = cap_info_comp_find(cur); - struct cos_compinfo *cur_ci = cap_info_ci(cur_rci); - vaddr_t pg; - - if (!cur_rci || !cap_info_init_check(cur_rci)) return 0; - if (!cur_ci) return 0; - - pg = (vaddr_t)cos_dcbpg_bump_allocn(cur_ci, npages * PAGE_SIZE); - - return pg; -} - -vaddr_t -memmgr_initdcbpage_retrieve(void) -{ - spdid_t cur = cos_inv_token(); - struct cap_comp_info *cur_rci = cap_info_comp_find(cur); - struct cap_comp_cpu_info *rci_cpu = cap_info_cpu_local(cur_rci); - - /* this should have 
been initialized either through the booter for capmgr/root-sched or by the capmgr on inithrd creation in cur component */ - assert(rci_cpu->initdcbpg); - - return rci_cpu->initdcbpg; -} - cbuf_t memmgr_shared_page_allocn_cserialized(vaddr_t *pgaddr, int *unused, unsigned long npages) { diff --git a/src/components/implementation/no_interface/llbooter/boot_deps.h b/src/components/implementation/no_interface/llbooter/boot_deps.h index a622fb9517..4d72beb8b1 100644 --- a/src/components/implementation/no_interface/llbooter/boot_deps.h +++ b/src/components/implementation/no_interface/llbooter/boot_deps.h @@ -35,7 +35,7 @@ struct comp_cap_info { vaddr_t addr_start; vaddr_t vaddr_mapped_in_booter; vaddr_t upcall_entry; - vaddr_t scbpg, initdcbpg[NUM_CPU]; + vaddr_t initdcbpgs; struct comp_sched_info *schedinfo[NUM_CPU]; struct cos_component_information *cobj_info; } new_comp_cap_info[MAX_NUM_SPDS]; @@ -164,40 +164,20 @@ boot_comp_mem_alloc(spdid_t spdid) cos_meminfo_alloc(compinfo, BOOT_MEM_KM_BASE, mem_sz); } -/* TODO: Should booter create that INITDCB page for all components for each core? */ -static void -boot_comp_dcb_alloc(spdid_t spdid) -{ - int i; - struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); - struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); - struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); - - spdinfo->initdcbpg[cos_cpuid()] = (vaddr_t)cos_dcbpg_bump_allocn(compinfo, PAGE_SIZE); - assert(spdinfo->initdcbpg[cos_cpuid()]); -} - /* Initialize just the captblcap and pgtblcap, due to hack for upcall_fn addr */ static void boot_compinfo_init(spdid_t spdid, captblcap_t *ct, pgtblcap_t *pt, u32_t heap_start_vaddr) { struct cos_compinfo *compinfo = boot_spd_compinfo_get(spdid); struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); - scbcap_t scbc = 0; - vaddr_t scb_uaddr = 0; + struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); *ct = cos_captbl_alloc(boot_info); assert(*ct); *pt = cos_pgtbl_alloc(boot_info); assert(*pt); - scbc = cos_pgtbl_alloc(boot_info); - assert(scbc); - - cos_compinfo_init(compinfo, *pt, *ct, 0, scbc, 0, (vaddr_t)heap_start_vaddr, BOOT_CAPTBL_FREE, boot_info); - scb_uaddr = cos_page_bump_intern_valloc(compinfo, COS_SCB_SIZE); - assert(scb_uaddr); - compinfo->scb_vas = scb_uaddr; + cos_compinfo_init(compinfo, *pt, *ct, 0, (vaddr_t)heap_start_vaddr, BOOT_CAPTBL_FREE, boot_info); /* * if this is a capmgr, let it manage its share (ideally rest of system memory) of memory. * if there is no capmgr in the system, allow every component to manage its memory. @@ -271,9 +251,14 @@ boot_newcomp_defcinfo_init(spdid_t spdid) struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); + dcbcap_t dcbcap = 0; + dcboff_t dcboff = 0; - boot_comp_dcb_alloc(spdid); - child_aep->thd = cos_initthd_alloc(boot_info, child_ci->comp_cap, child_ci->pgtbl_cap, spdinfo->initdcbpg[cos_cpuid()]); + + dcbcap = cos_dcb_alloc(boot_info, child_ci->pgtbl_cap, spdinfo->initdcbpgs + cos_cpuid() * PAGE_SIZE); + assert(dcbcap); + + child_aep->thd = cos_initthd_alloc(boot_info, child_ci->comp_cap, dcbcap); assert(child_aep->thd); if (spdsi->flags & COMP_FLAG_SCHED) { @@ -297,11 +282,8 @@ boot_comp_sched_set(spdid_t spdid) struct cos_aep_info *child_aep = boot_spd_initaep_get(spdid); int i = 0; - /* capmgr init only on boot core! 
*/ if (!capmgr_spdid) goto set; - /* - * if there is capmgr in the system, set it to be the first (index == 0) to initialize - */ + /* if there is capmgr in the system, set it to be the first (index == 0) to initialize */ if (spdid == capmgr_spdid) goto done; i = 1; @@ -322,8 +304,7 @@ boot_sched_caps_init(spdid_t spdid) struct cos_aep_info *child_aep = boot_spd_initaep_get(spdid); int ret, i; - /* If booter should create the init caps in that component */ - if (compsi->parent_spdid) return; + if (!capmgr_spdid || capmgr_spdid != spdid) return; boot_newcomp_defcinfo_init(spdid); ret = cos_cap_cpy_at(ci, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, boot_info, child_aep->thd); @@ -400,9 +381,25 @@ boot_newcomp_create(spdid_t spdid, struct cos_compinfo *comp_info) int i = 0; invtoken_t token = (invtoken_t)spdid; int ret; + vaddr_t scb_uaddr = 0; + scbcap_t scbcap = 0; + + scbcap = cos_scb_alloc(boot_info); + assert(scbcap); + scb_uaddr = cos_page_bump_intern_valloc(compinfo, COS_SCB_SIZE); + assert(scb_uaddr); + + if (spdinfo->initdcbpgs == 0) { + vaddr_t dcbaddr = 0; + + dcbaddr = cos_page_bump_intern_valloc(compinfo, NUM_CPU * PAGE_SIZE); + assert(dcbaddr); + + spdinfo->initdcbpgs = dcbaddr; + } /* scb info created on compinfo_init */ - cc = cos_comp_alloc(boot_info, ct, pt, compinfo->scb_cap, (vaddr_t)spdinfo->upcall_entry, compinfo->scb_vas); + cc = cos_comp_alloc(boot_info, ct, pt, scbcap, (vaddr_t)spdinfo->upcall_entry, scb_uaddr); assert(cc); compinfo->comp_cap = cc; @@ -427,7 +424,9 @@ boot_bootcomp_init(void) if (first_time) { first_time = 0; cos_meminfo_init(&(boot_info->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); + cos_defcompinfo_init_ext(BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, + BOOT_CAPTBL_SELF_COMP, (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE); } else { cos_defcompinfo_sched_init(); } @@ -570,6 +569,41 @@ boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t return ret; } +static inline int +boot_root_initaep_set(spdid_t dstid, spdid_t srcid, thdcap_t thd, arcvcap_t rcv, tcap_t tc) +{ + struct comp_sched_info *si = NULL; + struct cos_aep_info *a = NULL; + struct cos_compinfo *b = cos_compinfo_get(cos_defcompinfo_curr_get()), *c = boot_spd_compinfo_get(dstid); + + if (srcid > num_cobj || dstid > num_cobj) return -EINVAL; + if (!thd) return -EINVAL; + + si = boot_spd_comp_schedinfo_get(srcid); + if (si->parent_spdid != 0) return -EINVAL; + + a = boot_spd_initaep_get(srcid); + if (!a) return -EINVAL; + + a->thd = cos_cap_cpy(b, c, CAP_THD, thd); + assert(a->thd); + if ((si->flags & COMP_FLAG_SCHED) == 0) { + assert(!tc && !rcv); + goto done; + } + if (!rcv || !tc) return -EINVAL; + + a->tc = cos_cap_cpy(b, c, CAP_TCAP, tc); + assert(a->tc); + a->rcv = cos_cap_cpy(b, c, CAP_ARCV, rcv); + assert(a->rcv); + + boot_comp_sched_set(srcid); + +done: + return 0; +} + static inline int boot_comp_info_get(spdid_t dstid, spdid_t srcid, pgtblcap_t ptslot, captblcap_t ctslot, compcap_t compslot, spdid_t *parentid) { @@ -735,6 +769,18 @@ hypercall_entry(word_t *ret2, word_t *ret3, int op, word_t arg3, word_t arg4) break; } + case HYPERCALL_ROOT_INITAEP_SET: + { + spdid_t srcid = arg3 >> 16; + thdcap_t thd = (arg3 << 16) >> 16; + tcap_t tc = (arg4 << 16) >> 16;; + arcvcap_t rcv = arg4 >> 16; + + if (!__hypercall_resource_access_check(client, srcid, 0)) return -EACCES; + ret1 = boot_root_initaep_set(client, srcid, 
thd, rcv, tc); + + break; + } case HYPERCALL_COMP_CHILD_NEXT: { spdid_t srcid = arg3, child; @@ -793,17 +839,6 @@ hypercall_entry(word_t *ret2, word_t *ret3, int op, word_t arg3, word_t arg4) break; } - case HYPERCALL_COMP_INITDCB_GET: - { - spdid_t compid = arg3; - struct comp_cap_info *compinfo = NULL; - - if (!__hypercall_resource_access_check(client, compid, 0)) return 0; - if (!compid || compid > num_cobj) return 0; - compinfo = boot_spd_compcapinfo_get(compid); - - return compinfo->initdcbpg[cos_cpuid()]; - } case HYPERCALL_NUMCOMPS_GET: { ret1 = num_cobj + 1; /* including booter */ diff --git a/src/components/implementation/no_interface/vkernel/micro_booter.h b/src/components/implementation/no_interface/vkernel/micro_booter.h index 4c46b3e5cf..b6afc1bb80 100644 --- a/src/components/implementation/no_interface/vkernel/micro_booter.h +++ b/src/components/implementation/no_interface/vkernel/micro_booter.h @@ -47,7 +47,4 @@ tls_set(size_t off, unsigned long val) extern void test_run_vk(void); -void cos_dcb_info_init(void); -struct cos_dcb_info *cos_dcb_info_get(void); - #endif /* MICRO_BOOTER_H */ diff --git a/src/components/implementation/no_interface/vkernel/vk_api.c b/src/components/implementation/no_interface/vkernel/vk_api.c index 2a2d5277cf..ad710949dd 100644 --- a/src/components/implementation/no_interface/vkernel/vk_api.c +++ b/src/components/implementation/no_interface/vkernel/vk_api.c @@ -7,12 +7,6 @@ extern void vm_init(void *); extern void dom0_io_fn(void *); extern void vm_io_fn(void *); -struct cos_dcb_info * -cos_dcb_info_get(void) -{ - return cos_dcb_info_assign(); -} - static struct cos_aep_info * vm_schedaep_get(struct vms_info *vminfo) { return cos_sched_aep_get(&(vminfo->dci)); } @@ -37,7 +31,7 @@ vk_vm_create(struct vms_info *vminfo, struct vkernel_info *vkinfo) ret = cos_defcompinfo_child_alloc(vmdci, (vaddr_t)&cos_upcall_entry, (vaddr_t)BOOT_MEM_VM_BASE, VM_CAPTBL_FREE, 1, &initdcbpg); cos_compinfo_init(&(vminfo->shm_cinfo), vmcinfo->pgtbl_cap, vmcinfo->captbl_cap, vmcinfo->comp_cap, - vmcinfo->scb_cap, (vaddr_t)VK_VM_SHM_BASE, (vaddr_t)(VK_VM_SHM_BASE + COS_SCB_SIZE), VM_CAPTBL_FREE, vk_cinfo); + (vaddr_t)(VK_VM_SHM_BASE + COS_SCB_SIZE), VM_CAPTBL_FREE, vk_cinfo); printc("\tCreating and copying initial component capabilities\n"); ret = cos_cap_cpy_at(vmcinfo, BOOT_CAPTBL_SELF_CT, vk_cinfo, vmcinfo->captbl_cap); diff --git a/src/components/implementation/no_interface/vkernel/vk_types.h b/src/components/implementation/no_interface/vkernel/vk_types.h index 071786d37f..88fc6b9382 100644 --- a/src/components/implementation/no_interface/vkernel/vk_types.h +++ b/src/components/implementation/no_interface/vkernel/vk_types.h @@ -17,7 +17,7 @@ #define VM_FIXED_PERIOD_MS 10 #define VM_FIXED_BUDGET_MS 5 -#define VM_CAPTBL_SELF_SINV_BASE BOOT_CAPTBL_FREE +#define VM_CAPTBL_SELF_SINV_BASE LLBOOT_CAPTBL_FREE /* VM1~ I/O Capabilities layout */ #define VM_CAPTBL_SELF_IOTHD_BASE round_up_to_pow2(VM_CAPTBL_SELF_SINV_BASE + captbl_idsize(CAP_SINV), CAPMAX_ENTRY_SZ) #define VM_CAPTBL_SELF_IORCV_BASE round_up_to_pow2(VM_CAPTBL_SELF_IOTHD_BASE + captbl_idsize(CAP_THD), CAPMAX_ENTRY_SZ) diff --git a/src/components/implementation/no_interface/vkernel/vkernel.c b/src/components/implementation/no_interface/vkernel/vkernel.c index af106a5bba..7a7f3ca591 100644 --- a/src/components/implementation/no_interface/vkernel/vkernel.c +++ b/src/components/implementation/no_interface/vkernel/vkernel.c @@ -51,7 +51,7 @@ cos_init(void) * Or use some offset into the future in CAPTBL_FREE */ 
cos_compinfo_init(&vk_info.shm_cinfo, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - BOOT_CAPTBL_SELF_SCB, (vaddr_t)cos_scb_info_get(), (vaddr_t)VK_VM_SHM_BASE, BOOT_CAPTBL_FREE, ci); + (vaddr_t)VK_VM_SHM_BASE, LLBOOT_CAPTBL_FREE, ci); vk_info.termthd = cos_thd_alloc(vk_cinfo, vk_cinfo->comp_cap, vk_terminate, NULL, 0, 0); assert(vk_info.termthd); diff --git a/src/components/implementation/no_interface/vkernel/vm_booter.c b/src/components/implementation/no_interface/vkernel/vm_booter.c index 67c634290d..a64cd656b4 100644 --- a/src/components/implementation/no_interface/vkernel/vm_booter.c +++ b/src/components/implementation/no_interface/vkernel/vm_booter.c @@ -31,7 +31,7 @@ vm_init(void *d) cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, VM_UNTYPED_SIZE, BOOT_CAPTBL_SELF_UNTYPED_PT); /* FIXME: will need to verify if scb stuff works here */ cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - BOOT_CAPTBL_SELF_SCB, (vaddr_t)cos_scb_info_get(), (vaddr_t)cos_get_heap_ptr(), vmid == 0 ? DOM0_CAPTBL_FREE : VM_CAPTBL_FREE, &booter_info); + (vaddr_t)cos_get_heap_ptr(), vmid == 0 ? DOM0_CAPTBL_FREE : VM_CAPTBL_FREE, &booter_info); PRINTC("Virtual-machine booter started.\n"); test_run_vk(); diff --git a/src/components/implementation/sched/hier_fprr/init.c b/src/components/implementation/sched/hier_fprr/init.c index 8e8ddba9b5..f7f45357fb 100644 --- a/src/components/implementation/sched/hier_fprr/init.c +++ b/src/components/implementation/sched/hier_fprr/init.c @@ -48,14 +48,33 @@ __init_done(void *d) void sched_child_init(struct sched_childinfo *schedci) { - struct sl_thd *initthd = NULL; + vaddr_t dcbaddr; assert(schedci); - initthd = sched_child_initthd_get(schedci); - assert(initthd); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); + schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 1 : 0, 0, &dcbaddr); + assert(schedci->initthd); + + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, &addr, NULL); + + return t ? sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, arcvcap_t *extrcv) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, &addr, extrcv); + + return t ? 
sl_thd_thdid(t) : 0; } void diff --git a/src/components/implementation/sched/root_fprr/init.c b/src/components/implementation/sched/root_fprr/init.c index 559ef6c685..c3f804bbd3 100644 --- a/src/components/implementation/sched/root_fprr/init.c +++ b/src/components/implementation/sched/root_fprr/init.c @@ -38,14 +38,33 @@ __init_done(void *d) void sched_child_init(struct sched_childinfo *schedci) { - struct sl_thd *initthd = NULL; + vaddr_t dcbaddr; assert(schedci); - initthd = sched_child_initthd_get(schedci); - assert(initthd); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); + schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 1 : 0, 0, &dcbaddr); + assert(schedci->initthd); + + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, &addr, NULL); + + return t ? sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, arcvcap_t *extrcv) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, &addr, extrcv); + + return t ? sl_thd_thdid(t) : 0; } void diff --git a/src/components/implementation/sched/root_fprr_raw/init.c b/src/components/implementation/sched/root_fprr_raw/init.c index ba79a791ab..40ae673549 100644 --- a/src/components/implementation/sched/root_fprr_raw/init.c +++ b/src/components/implementation/sched/root_fprr_raw/init.c @@ -17,13 +17,28 @@ capmgr_thd_retrieve_next(spdid_t child, thdid_t *tid) void sched_child_init(struct sched_childinfo *schedci) { - struct sl_thd *initthd = NULL; - assert(schedci); - initthd = sched_child_initthd_get(schedci); - assert(initthd); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); + schedci->initthd = sl_thd_initaep_alloc_dcb(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 1 : 0, 0, 0); + + assert(schedci->initthd); + sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + struct sl_thd *t = sl_thd_aep_alloc_ext_dcb(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, NULL); + + return t ? sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, arcvcap_t *extrcv) +{ + struct sl_thd *t = sl_thd_aep_alloc_ext_dcb(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, 0, 0, extrcv); + + return t ? 
sl_thd_thdid(t) : 0; } void diff --git a/src/components/implementation/sched/sched.c b/src/components/implementation/sched/sched.c index 5fa092a2ee..4eced00ffb 100644 --- a/src/components/implementation/sched/sched.c +++ b/src/components/implementation/sched/sched.c @@ -43,34 +43,26 @@ thdid_t sched_thd_create_cserialized(thdclosure_index_t idx) { spdid_t c = cos_inv_token(); - struct cos_defcompinfo *dci; - struct sl_thd *t = NULL; + struct sched_childinfo *sci; if (!c) return 0; - dci = sched_child_defci_get(sched_childinfo_find(c)); - if (!dci) return 0; + sci = sched_childinfo_find(c); + if (!sci) return 0; - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 0, 0, 0, 0, NULL); - if (!t) return 0; - - return sl_thd_thdid(t); + return sched_child_thd_create(sci, idx); } thdid_t sched_aep_create_cserialized(arcvcap_t *extrcv, int *unused, thdclosure_index_t idx, int owntc, cos_channelkey_t key) { spdid_t c = cos_inv_token(); - struct cos_defcompinfo *dci; - struct sl_thd *t = NULL; + struct sched_childinfo *sci; if (!c) return 0; - dci = sched_child_defci_get(sched_childinfo_find(c)); - if (!dci) return 0; - - t = sl_thd_aep_alloc_ext(dci, NULL, idx, 1, owntc, key, 0, extrcv); - if (!t) return 0; + sci = sched_childinfo_find(c); + if (!sci) return 0; - return sl_thd_thdid(t); + return sched_child_aep_create(sci, idx, owntc, key, extrcv); } int diff --git a/src/components/implementation/sched/sched_info.c b/src/components/implementation/sched/sched_info.c index 5629220deb..a3c5df025a 100644 --- a/src/components/implementation/sched/sched_info.c +++ b/src/components/implementation/sched/sched_info.c @@ -75,9 +75,7 @@ sched_childinfo_init_intern(int is_raw) memset(childinfo[cos_cpuid()], 0, sizeof(struct sched_childinfo) * SCHED_MAX_CHILD_COMPS); while ((remaining = hypercall_comp_child_next(cos_spd_id(), &child, &childflags)) >= 0) { - struct cos_defcompinfo *child_dci = NULL; struct sched_childinfo *schedinfo = NULL; - struct sl_thd *initthd = NULL; compcap_t compcap = 0; PRINTLOG(PRINT_DEBUG, "Initializing child component %u, is_sched=%d\n", child, childflags & COMP_FLAG_SCHED); @@ -88,11 +86,6 @@ sched_childinfo_init_intern(int is_raw) schedinfo = sched_childinfo_alloc(child, compcap, childflags); assert(schedinfo); - child_dci = sched_child_defci_get(schedinfo); - - initthd = sl_thd_initaep_alloc(child_dci, NULL, childflags & COMP_FLAG_SCHED, childflags & COMP_FLAG_SCHED ? 
1 : 0, 0, 0); - assert(initthd); - sched_child_initthd_set(schedinfo, initthd); sched_child_init(schedinfo); if (!remaining) break; diff --git a/src/components/implementation/sched/sched_info.h b/src/components/implementation/sched/sched_info.h index bbdecf6c97..d4270037ce 100644 --- a/src/components/implementation/sched/sched_info.h +++ b/src/components/implementation/sched/sched_info.h @@ -22,6 +22,8 @@ void sched_childinfo_init(void); void sched_childinfo_init_raw(void); extern unsigned int self_init[], num_child_init[]; +extern thdid_t sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx); +extern thdid_t sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, arcvcap_t *extrcv); static inline struct cos_defcompinfo * sched_child_defci_get(struct sched_childinfo *sci) diff --git a/src/components/implementation/tests/micro_booter/mb_tests.c b/src/components/implementation/tests/micro_booter/mb_tests.c index 68f9fd1981..7795ce0a2b 100644 --- a/src/components/implementation/tests/micro_booter/mb_tests.c +++ b/src/components/implementation/tests/micro_booter/mb_tests.c @@ -50,7 +50,7 @@ test_thds_perf(void) long long start_swt_cycles = 0, end_swt_cycles = 0; int i; - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_perf, NULL, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_perf, NULL, 0, 0); assert(ts); cos_thd_switch(ts); @@ -83,7 +83,7 @@ test_thds(void) intptr_t i; for (i = 0; i < TEST_NTHDS; i++) { - ts[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn, (void *)i, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); + ts[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn, (void *)i, 0, 0); assert(ts[i]); tls_test[cos_cpuid()][i] = i; cos_thd_mod(&booter_info, ts[i], &tls_test[cos_cpuid()][i]); @@ -279,7 +279,7 @@ test_async_endpoints(void) PRINTC("Creating threads, and async end-points.\n"); /* parent rcv capabilities */ tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent, - (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); + (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 0, 0); assert(tcp); tccp = cos_tcap_alloc(&booter_info); assert(tccp); @@ -291,7 +291,7 @@ test_async_endpoints(void) } /* child rcv capabilities */ - tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn, (void *)tcp, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); + tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn, (void *)tcp, 0, 0); assert(tcc); tccc = cos_tcap_alloc(&booter_info); assert(tccc); @@ -324,7 +324,7 @@ test_async_endpoints_perf(void) /* parent rcv capabilities */ tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent_perf, - (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); + (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 0, 0); assert(tcp); tccp = cos_tcap_alloc(&booter_info); assert(tccp); @@ -333,7 +333,7 @@ test_async_endpoints_perf(void) if (cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, TCAP_PRIO_MAX + 1)) assert(0); /* child rcv capabilities */ - tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn_perf, (void *)tcp, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); + tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn_perf, (void *)tcp, 0, 0); assert(tcc); tccc = 
cos_tcap_alloc(&booter_info); assert(tccc); @@ -367,7 +367,7 @@ test_timer(void) cycles_t c = 0, p = 0, t = 0; PRINTC("Starting timer test.\n"); - tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); + tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL, 0, 0); for (i = 0; i <= 16; i++) { thdid_t tid; @@ -414,7 +414,7 @@ exec_cluster_alloc(struct exec_cluster *e, cos_thd_fn_t fn, void *d, arcvcap_t p { e->tcc = cos_tcap_alloc(&booter_info); assert(e->tcc); - e->tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, fn, d, booter_info.pgtbl_cap, (vaddr_t)cos_dcb_info_get()); + e->tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, fn, d, 0, 0); assert(e->tc); e->rc = cos_arcv_alloc(&booter_info, e->tc, e->tcc, booter_info.comp_cap, parentc); assert(e->rc); diff --git a/src/components/implementation/tests/micro_booter/micro_booter.c b/src/components/implementation/tests/micro_booter/micro_booter.c index a6328c4278..e9346c1b58 100644 --- a/src/components/implementation/tests/micro_booter/micro_booter.c +++ b/src/components/implementation/tests/micro_booter/micro_booter.c @@ -18,37 +18,6 @@ term_fn(void *d) static int test_done[NUM_CPU]; -#define COS_DCB_MAX_PER_PAGE (PAGE_SIZE / sizeof(struct cos_dcb_info)) -static unsigned long free_off[NUM_CPU] = { 0 }, total[NUM_CPU] = { 0 }; -static struct cos_dcb_info *dcb_st[NUM_CPU] = { NULL }; - -void -cos_dcb_info_init(void) -{ - free_off[cos_cpuid()] = 1; - - dcb_st[cos_cpuid()] = cos_init_dcb_get(); -} - -struct cos_dcb_info * -cos_dcb_info_get(void) -{ - unsigned int curr_off = 0; - - curr_off = ps_faa(&free_off[cos_cpuid()], 1); - if (curr_off == COS_DCB_MAX_PER_PAGE) { - /* will need a version that calls down to capmgr for more pages */ - dcb_st[cos_cpuid()] = cos_dcbpg_bump_allocn(&booter_info, PAGE_SIZE); - assert(dcb_st[cos_cpuid()]); - - free_off[cos_cpuid()] = 0; - - return dcb_st[cos_cpuid()]; - } - - return (dcb_st[cos_cpuid()] + curr_off); -} - void cos_init(void) { @@ -63,28 +32,14 @@ cos_init(void) first_init = 0; cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - BOOT_CAPTBL_SELF_SCB, (vaddr_t)cos_scb_info_get(), (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE, &booter_info); + (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE, &booter_info); init_done = 1; } while (!init_done) ; - cos_dcb_info_init(); - initaddr = cos_init_dcb_get(); - PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_SP)); - initaddr->ip = 10; - initaddr->sp = 0; - PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_SP)); - - termaddr = cos_dcb_info_get(); - termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL, booter_info.pgtbl_cap, (vaddr_t)termaddr); + termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL, 0, 0); assert(termthd[cos_cpuid()]); - PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)termthd[cos_cpuid()], 
(unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_SP)); - termaddr->ip = 30; - termaddr->sp = 0; - PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)termthd[cos_cpuid()], (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_SP)); - PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, THD_GET_DCB_SP)); - PRINTC("%u DCB IP: %lx, DCB SP: %lx\n", (unsigned int)termthd[cos_cpuid()], (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_IP), (unsigned long)cos_introspect(&booter_info, termthd[cos_cpuid()], THD_GET_DCB_SP)); PRINTC("Micro Booter started.\n"); test_run_mb(); diff --git a/src/components/implementation/tests/micro_booter/micro_booter.h b/src/components/implementation/tests/micro_booter/micro_booter.h index 632897aae9..8410b110f6 100644 --- a/src/components/implementation/tests/micro_booter/micro_booter.h +++ b/src/components/implementation/tests/micro_booter/micro_booter.h @@ -26,11 +26,11 @@ #include #include #include +#include #define ITER 10000 #define TEST_NTHDS 5 -extern struct cos_dcb_info *init_dcbinfo[]; extern struct cos_compinfo booter_info; extern thdcap_t termthd[]; /* switch to this to shutdown */ extern unsigned long tls_test[][TEST_NTHDS]; @@ -54,7 +54,4 @@ tls_set(size_t off, unsigned long val) extern void test_run_mb(void); -void cos_dcb_info_init(void); -struct cos_dcb_info *cos_dcb_info_get(void); - #endif /* MICRO_BOOTER_H */ diff --git a/src/components/implementation/tests/unit_capmgr/unit_capmgr.c b/src/components/implementation/tests/unit_capmgr/unit_capmgr.c index 91b24e7e47..880428378c 100644 --- a/src/components/implementation/tests/unit_capmgr/unit_capmgr.c +++ b/src/components/implementation/tests/unit_capmgr/unit_capmgr.c @@ -33,7 +33,9 @@ test_thds(void) int failure = 0; for (; i < TEST_N_THDS; i++) { - test_ts[cos_cpuid()][i] = capmgr_thd_create(__test_thd_fn, (void *)i, &tid); + struct cos_dcb_info *dcb; + + test_ts[cos_cpuid()][i] = capmgr_thd_create(__test_thd_fn, (void *)i, &tid, &dcb); assert(test_ts[cos_cpuid()][i]); if (cos_thd_switch(test_ts[cos_cpuid()][i])) { diff --git a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c index c9f5398df7..b6db7a0af6 100644 --- a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c +++ b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c @@ -56,7 +56,7 @@ test_aeps(void) asndcap_t snd; printc("\tCreating AEP [%d]\n", i); - ret = cos_aep_tcap_alloc(&(test_aep[i]), BOOT_CAPTBL_SELF_INITTCAP_BASE, aep_thd_fn, (void *)i, 0); + ret = cos_aep_tcap_alloc(&(test_aep[i]), BOOT_CAPTBL_SELF_INITTCAP_BASE, aep_thd_fn, (void *)i, 0, 0); assert(ret == 0); snd = cos_asnd_alloc(ci, test_aep[i].rcv, ci->captbl_cap); diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c b/src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c index 55fee656d0..a885c32f38 100644 --- a/src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c +++ b/src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c 
@@ -152,6 +152,7 @@ cos_init(void) assert(hypercall_comp_child_next(cos_spd_id(), &child, &childflag) == -1); testtid = sched_thd_create(run_tests, NULL); + assert(testtid); sched_thd_param_set(testtid, sched_param_pack(SCHEDP_PRIO, LOWEST_PRIORITY)); while (1) { diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index 0814ccbd94..c7cd84a532 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -13,6 +13,7 @@ #include #include #include +#include /* sl also defines a SPIN macro */ #undef SPIN @@ -218,7 +219,8 @@ cos_init(void) printc("Unit-test for the scheduling library (sl)\n"); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); + cos_defcompinfo_llinit(); + cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); //test_yield_perf(); diff --git a/src/components/include/cos_component.h b/src/components/include/cos_component.h index 4217ccb09b..5929c0b5aa 100644 --- a/src/components/include/cos_component.h +++ b/src/components/include/cos_component.h @@ -226,9 +226,7 @@ static inline struct cos_dcb_info * cos_init_dcb_get(void) { /* created at boot-time for the first component in the system! */ - if (cos_spd_id() == 0) return (struct cos_dcb_info *)(cos_comp_info.cos_heap_ptr + COS_SCB_SIZE + (PAGE_SIZE * cos_cpuid())); - - return NULL; + return (struct cos_dcb_info *)(cos_comp_info.cos_heap_ptr + COS_SCB_SIZE + (PAGE_SIZE * cos_cpuid())); } static inline void diff --git a/src/components/include/cos_dcb.h b/src/components/include/cos_dcb.h index 81cc8b4395..1fc6298da6 100644 --- a/src/components/include/cos_dcb.h +++ b/src/components/include/cos_dcb.h @@ -2,10 +2,27 @@ #define COS_DCB_H #include +#include #define COS_DCB_PERPG_MAX (PAGE_SIZE / sizeof(struct cos_dcb_info)) -void cos_dcb_info_init(void); -struct cos_dcb_info *cos_dcb_info_assign(void); +#define COS_DCB_MAX_CAPS (MAX_NUM_THREADS / COS_DCB_PERPG_MAX + 1) + +struct cos_dcbinfo_data { + dcbcap_t dcbcaps[COS_DCB_MAX_CAPS]; + vaddr_t dcbaddr[COS_DCB_MAX_CAPS]; + dcboff_t curr_cap_off; + unsigned short curr_cap; + + struct cos_compinfo *ci; +} CACHE_ALIGNED; + +void cos_dcb_info_init(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci); +void cos_dcb_info_init_ext(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci, dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t start_off); +dcbcap_t cos_dcb_info_alloc(struct cos_dcbinfo_data *cdi, dcboff_t *dcboff, vaddr_t *dcbaddr); + +void cos_dcb_info_init_curr(void); +void cos_dcb_info_init_curr_ext(dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t st_off); +dcbcap_t cos_dcb_info_alloc_curr(dcboff_t *dcboff, vaddr_t *dcbaddr); #endif /* COS_DCB_H */ diff --git a/src/components/include/cos_defkernel_api.h b/src/components/include/cos_defkernel_api.h index 629ada6432..b98796c129 100644 --- a/src/components/include/cos_defkernel_api.h +++ b/src/components/include/cos_defkernel_api.h @@ -81,12 +81,13 @@ struct cos_aep_info *cos_sched_aep_get(struct cos_defcompinfo *defci); * capabilities layout. */ void cos_defcompinfo_init(void); +void cos_defcompinfo_llinit(void); /* * cos_defcompinfo_init_ext: initialize the current component's global cos_defcompinfo struct using the parameters * passed. 
*/ void cos_defcompinfo_init_ext(tcap_t sched_tc, thdcap_t sched_thd, arcvcap_t sched_rcv, pgtblcap_t pgtbl_cap, - captblcap_t captbl_cap, compcap_t comp_cap, scbcap_t scb_cap, vaddr_t scb_ptr, vaddr_t heap_ptr, capid_t cap_frontier); + captblcap_t captbl_cap, compcap_t comp_cap, vaddr_t heap_ptr, capid_t cap_frontier); /* for AP cores */ void cos_defcompinfo_sched_init_ext(tcap_t sched_tc, thdcap_t sched_thd, arcvcap_t sched_rcv); @@ -96,21 +97,21 @@ void cos_defcompinfo_sched_init(void); * cos_defcompinfo_child_alloc: called to create a new child component including initial capabilities like pgtbl, * captbl, compcap, aep. if is_sched is set, scheduling end-point will also be created for the child component, else, * the current component's scheduler will remain the scheduler for the child component. - * NOTE: dcbuaddr is the address in child_dci page-table!. + * TODO: initdcb cap and initdcb addr? */ int cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, vaddr_t heap_ptr, - capid_t cap_frontier, int is_sched, vaddr_t *dcbuaddr); + capid_t cap_frontier, int is_sched, dcbcap_t *initdcbcap); /* * cos_aep_alloc: creates a new async activation end-point which includes thread, tcap and rcv capabilities. * struct cos_aep_info passed in, must not be stack allocated. */ -int cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, vaddr_t dcbuaddr); +int cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff); /* * cos_aep_alloc: creates a new async activation end-point, using an existing tcap. * struct cos_aep_info passed in, must not be stack allocated. */ -int cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data, vaddr_t dcbuaddr); +int cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff); /* * cos_initaep_alloc: create an initaep in the @child_dci and using sched->rcv as the parent, sets up cos_sched_ape_get(@child_dci) with the init capabilities. @@ -118,27 +119,27 @@ int cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, * if @is_sched == 0, creates only the init thread (does not need @sched parameter) * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_initaep_alloc(struct cos_defcompinfo *child_dci, struct cos_aep_info *sched, int is_sched, vaddr_t dcbuaddr); +int cos_initaep_alloc(struct cos_defcompinfo *child_dci, struct cos_aep_info *sched, int is_sched, dcbcap_t dcap); /* * cos_initaep_tcap_alloc: same as cos_initaep_alloc with is_sched == 1, except it doesn't create a new tcap, * uses the tcap passed in @tc. * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_initaep_tcap_alloc(struct cos_defcompinfo *child_dci, tcap_t tc, struct cos_aep_info *sched, vaddr_t dcbuaddr); +int cos_initaep_tcap_alloc(struct cos_defcompinfo *child_dci, tcap_t tc, struct cos_aep_info *sched, dcbcap_t dcap); /* * cos_aep_alloc_ext: creates a new async activation end-point which includes thread, tcap and rcv capabilities in the child_dci component using sched_aep->rcv. * if @child_dci == NULL, create in the current component. * NOTE: dcbuaddr is the address in child_dci page-table. 
*/ -int cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, thdclosure_index_t idx, vaddr_t dcbuaddr); +int cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff); /* * cos_aep_alloc_ext: creates a new async activation end-point which includes thread, tcap and rcv capabilities in the child_dci component using sched_aep->rcv. * if @child_dci == NULL, create in the current component. * NOTE: dcbuaddr is the address in child_dci page-table. */ -int cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, tcap_t tc, thdclosure_index_t idx, vaddr_t dcbuaddr); +int cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *child_dci, struct cos_aep_info *sched_aep, tcap_t tc, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff); /* * cos_defswitch: thread switch api using the default scheduling tcap and rcv. diff --git a/src/components/include/cos_kernel_api.h b/src/components/include/cos_kernel_api.h index a67d50ac84..ffc5d2c6c1 100644 --- a/src/components/include/cos_kernel_api.h +++ b/src/components/include/cos_kernel_api.h @@ -56,6 +56,7 @@ typedef capid_t pgtblcap_t; typedef capid_t hwcap_t; typedef capid_t scbcap_t; typedef capid_t dcbcap_t; +typedef unsigned short dcboff_t; /* Memory source information */ struct cos_meminfo { @@ -67,7 +68,7 @@ struct cos_meminfo { /* Component captbl/pgtbl allocation information */ struct cos_compinfo { /* capabilities to higher-order capability tables (or -1) */ - capid_t pgtbl_cap, captbl_cap, comp_cap, scb_cap; + capid_t pgtbl_cap, captbl_cap, comp_cap; /* the frontier of unallocated caps, and the allocated captbl range */ capid_t cap_frontier, caprange_frontier; /* the frontier for each of the various sizes of capability per core! */ @@ -77,14 +78,13 @@ struct cos_compinfo { /* the source of memory */ struct cos_compinfo *memsrc; /* might be self-referential */ struct cos_meminfo mi; /* only populated for the component with real memory */ - vaddr_t scb_vas; /* scb virtual address in the component's pgtbl */ struct ps_lock cap_lock, mem_lock; /* locks to make the cap frontier and mem frontier updates and expands atomic */ struct ps_lock va_lock; /* lock to make the vas frontier and bump expands for vas atomic */ }; void cos_compinfo_init(struct cos_compinfo *ci, pgtblcap_t pgtbl_cap, captblcap_t captbl_cap, compcap_t comp_cap, - scbcap_t scb_cap, vaddr_t scb_vas, vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources); + vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources); /* * This only needs be called on compinfos that are managing resources * (i.e. likely only one). All of the capabilities will be relative @@ -110,21 +110,24 @@ int cos_pgtbl_intern_expandwith(struct cos_compinfo *ci, pgtblcap_t intern, vadd * This uses the next three functions to allocate a new component and * correctly populate ci (allocating all resources from ci_resources). 
*/ -int cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, - struct cos_compinfo *ci_resources); +int cos_compinfo_alloc(struct cos_compinfo *ci, scbcap_t sc, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, struct cos_compinfo *ci_resources); captblcap_t cos_captbl_alloc(struct cos_compinfo *ci); pgtblcap_t cos_pgtbl_alloc(struct cos_compinfo *ci); -compcap_t cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, scbcap_t scbc, vaddr_t entry, vaddr_t scb_addr); +compcap_t cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, scbcap_t scbc, vaddr_t entry, + vaddr_t scb_addr); scbcap_t cos_scb_alloc(struct cos_compinfo *ci); -dcbcap_t cos_dcb_alloc(struct cos_compinfo *ci, vaddr_t *dcb_uaddr); +dcbcap_t cos_dcb_alloc(struct cos_compinfo *ci, pgtblcap_t ptc, vaddr_t dcb_uaddr); typedef void (*cos_thd_fn_t)(void *); -thdcap_t cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, pgtblcap_t ptc, vaddr_t dcbuaddr); -thdcap_t cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx, pgtblcap_t ptc, vaddr_t dcbuaddr); +thdcap_t cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, dcbcap_t dc, + dcboff_t dcboff); +thdcap_t cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx, dcbcap_t dc, + dcboff_t dcboff); /* Create the initial (cos_init) thread */ -thdcap_t cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, pgtblcap_t, vaddr_t dcbuaddr); +thdcap_t cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, dcbcap_t dc); sinvcap_t cos_sinv_alloc(struct cos_compinfo *srcci, compcap_t dstcomp, vaddr_t entry, invtoken_t token); -arcvcap_t cos_arcv_alloc(struct cos_compinfo *ci, thdcap_t thdcap, tcap_t tcapcap, compcap_t compcap, arcvcap_t enotif); +arcvcap_t cos_arcv_alloc(struct cos_compinfo *ci, thdcap_t thdcap, tcap_t tcapcap, compcap_t compcap, + arcvcap_t enotif); asndcap_t cos_asnd_alloc(struct cos_compinfo *ci, arcvcap_t arcvcap, captblcap_t ctcap); void *cos_page_bump_alloc(struct cos_compinfo *ci); @@ -163,7 +166,8 @@ int cos_asnd(asndcap_t snd, int yield); /* returns non-zero if there are still pending events (i.e. there have been pending snds) */ int cos_rcv(arcvcap_t rcv, rcv_flags_t flags, int *rcvd); /* returns the same value as cos_rcv, but also information about scheduling events */ -int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, int *rcvd, thdid_t *thdid, int *blocked, cycles_t *cycles, tcap_time_t *thd_timeout); +int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, int *rcvd, thdid_t *thdid, int *blocked, + cycles_t *cycles, tcap_time_t *thd_timeout); int cos_introspect(struct cos_compinfo *ci, capid_t cap, unsigned long op); diff --git a/src/components/include/hypercall.h b/src/components/include/hypercall.h index 192f13ee67..1e7b466b61 100644 --- a/src/components/include/hypercall.h +++ b/src/components/include/hypercall.h @@ -14,12 +14,13 @@ enum hypercall_cntl { HYPERCALL_COMP_CAPTBLCAP_GET, HYPERCALL_COMP_PGTBLCAP_GET, HYPERCALL_COMP_CAPFRONTIER_GET, - HYPERCALL_COMP_INITDCB_GET, /* per-core, each core. 
only for threads created by llbooter */ HYPERCALL_COMP_INITAEP_GET, HYPERCALL_COMP_CHILD_NEXT, HYPERCALL_NUMCOMPS_GET, + + HYPERCALL_ROOT_INITAEP_SET, /* per-core root-scheduler init-aeps created by capmgr and passed to llbooter */ }; static inline int @@ -81,6 +82,18 @@ hypercall_comp_initaep_get(spdid_t spdid, int is_sched, struct cos_aep_info *aep return 0; } +static inline int +hypercall_root_initaep_set(spdid_t spdid, struct cos_aep_info *aep) +{ + int ret = 0; + + ret = cos_sinv(BOOT_CAPTBL_SINV_CAP, 0, HYPERCALL_ROOT_INITAEP_SET, spdid << 16 | aep->thd, + aep->rcv << 16 | aep->tc); + if (ret) return ret; + + return 0; +} + /* Note: This API can be called ONLY by components that manage capability resources */ static inline int hypercall_comp_info_get(spdid_t spdid, pgtblcap_t *ptslot, captblcap_t *ctslot, compcap_t *compslot, spdid_t *parentid) @@ -188,12 +201,6 @@ hypercall_comp_capfrontier_get(spdid_t spdid) return cap_frontier; } -static inline vaddr_t -hypercall_initdcb_get(spdid_t spdid) -{ - return (vaddr_t)cos_sinv(BOOT_CAPTBL_SINV_CAP, 0, HYPERCALL_COMP_INITDCB_GET, spdid, 0); -} - static inline int hypercall_numcomps_get(void) { diff --git a/src/components/include/sl.h b/src/components/include/sl.h index fe988d27e1..09620b0054 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -289,8 +289,10 @@ struct sl_thd *sl_thd_aep_alloc(cos_aepthd_fn_t fn, void *data, int own_tcap, co */ struct sl_thd *sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched); -struct sl_thd *sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr); -struct sl_thd *sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr, arcvcap_t *extrcv); +struct sl_thd *sl_thd_initaep_alloc_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, dcbcap_t dcap); +struct sl_thd *sl_thd_aep_alloc_ext_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, arcvcap_t *extrcv); +struct sl_thd *sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, vaddr_t *dcbaddr); +struct sl_thd *sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, vaddr_t *dcbaddr, arcvcap_t *extrcv); struct sl_thd *sl_thd_init_ext(struct cos_aep_info *aep, struct sl_thd *sched_thd); @@ -473,6 +475,8 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) } else { /* TODO: can't use if you're reprogramming a timer/prio */ return sl_thd_dispatch(t, tok, sl_thd_curr()); + //return cos_switch(sl_thd_thdcap(t), g->sched_tcap, t->prio, + // g->timeout_next, g->sched_rcv, tok); } } diff --git a/src/components/interface/capmgr/capmgr.h b/src/components/interface/capmgr/capmgr.h index 5ca8828d6b..05e73f4dce 100644 --- a/src/components/interface/capmgr/capmgr.h +++ b/src/components/interface/capmgr/capmgr.h @@ -6,10 +6,10 @@ thdcap_t capmgr_initthd_create(spdid_t child, thdid_t *tid); thdcap_t capmgr_initaep_create(spdid_t child, struct cos_aep_info *aep, int owntc, cos_channelkey_t key, asndcap_t *sndret); -thdcap_t capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid); -thdcap_t capmgr_aep_create(struct 
cos_aep_info *a, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key); -thdcap_t capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid); -thdcap_t capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *a, thdclosure_index_t idx, int owntc, cos_channelkey_t key, arcvcap_t *extrcv); +thdcap_t capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid, struct cos_dcb_info **dcb); +thdcap_t capmgr_aep_create(struct cos_aep_info *a, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key, struct cos_dcb_info **dcb); +thdcap_t capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid, struct cos_dcb_info **dcb); +thdcap_t capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *a, thdclosure_index_t idx, int owntc, cos_channelkey_t key, struct cos_dcb_info **dcb, arcvcap_t *extrcv); thdcap_t capmgr_thd_retrieve(spdid_t child, thdid_t t, thdid_t *inittid); thdcap_t capmgr_thd_retrieve_next(spdid_t child, thdid_t *tid); asndcap_t capmgr_asnd_create(spdid_t child, thdid_t t); diff --git a/src/components/interface/capmgr/memmgr.h b/src/components/interface/capmgr/memmgr.h index 36b4c70de1..b4125336b2 100644 --- a/src/components/interface/capmgr/memmgr.h +++ b/src/components/interface/capmgr/memmgr.h @@ -11,7 +11,4 @@ cbuf_t memmgr_shared_page_alloc(vaddr_t *pgaddr); cbuf_t memmgr_shared_page_allocn(unsigned long num_pages, vaddr_t *pgaddr); unsigned long memmgr_shared_page_map(cbuf_t id, vaddr_t *pgaddr); -vaddr_t memmgr_initdcbpage_retrieve(void); -vaddr_t memmgr_dcbpage_allocn(unsigned long num_pages); - #endif /* MEMMGR_H */ diff --git a/src/components/interface/capmgr/stubs/c_stub.c b/src/components/interface/capmgr/stubs/c_stub.c index edb760c79a..d1dbfc9606 100644 --- a/src/components/interface/capmgr/stubs/c_stub.c +++ b/src/components/interface/capmgr/stubs/c_stub.c @@ -5,10 +5,12 @@ thdcap_t capmgr_initthd_create_cserialized(thdid_t *tid, int *unused, spdid_t s); thdcap_t capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, spdid_t s, int owntc, cos_channelkey_t key); -thdcap_t capmgr_thd_create_cserialized(thdid_t *tid, int *unused, thdclosure_index_t idx); -thdcap_t capmgr_aep_create_cserialized(thdid_t *tid, u32_t *tcrcvret, thdclosure_index_t idx, int owntc, cos_channelkey_t key); -thdcap_t capmgr_thd_create_ext_cserialized(thdid_t *tid, int *unused, spdid_t s, thdclosure_index_t idx); -thdcap_t capmgr_aep_create_ext_cserialized(u32_t *drcvtidret, u32_t *rcvtcret, spdid_t s, thdclosure_index_t idx, u32_t owntc_aepkey); +thdcap_t capmgr_thd_create_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, thdclosure_index_t idx); +u32_t capmgr_aep_create_cserialized(struct cos_dcb_info **dcb, u32_t *tcrcvret, thdclosure_index_t idx, int owntc, cos_channelkey_t key); +thdcap_t capmgr_thd_create_ext_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, spdid_t s, thdclosure_index_t idx); +/* rcvcap for spdid = s shall be obtained through a separate call to capmgr! 
*/ +arcvcap_t capmgr_aep_rcv_retrieve_cserialized(spdid_t s, thdid_t tid); +u32_t capmgr_aep_create_ext_cserialized(struct cos_dcb_info **dcb, u32_t *rcvtcret, spdid_t s, thdclosure_index_t idx, u32_t owntc_aepkey); thdcap_t capmgr_thd_retrieve_next_cserialized(thdid_t *tid, int *unused, spdid_t s); thdcap_t capmgr_thd_retrieve_cserialized(thdid_t *inittid, int *unused, spdid_t s, thdid_t tid); @@ -37,28 +39,25 @@ capmgr_initthd_create(spdid_t child, thdid_t *tid) } thdcap_t -capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid) +capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid, struct cos_dcb_info **dcb) { - int unused; thdclosure_index_t idx = cos_thd_init_alloc(fn, data); if (idx < 1) return 0; - return capmgr_thd_create_cserialized(tid, &unused, idx); + return capmgr_thd_create_cserialized(dcb, tid, idx); } thdcap_t -capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid) +capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid, struct cos_dcb_info **dcb) { - int unused; - - return capmgr_thd_create_ext_cserialized(tid, &unused, child, idx); + return capmgr_thd_create_ext_cserialized(dcb, tid, child, idx); } thdcap_t -capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key) +capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key, struct cos_dcb_info **dcb) { - u32_t tcrcvret = 0; + u32_t tcrcvret = 0, thdtidret = 0; thdcap_t thd = 0; arcvcap_t rcv = 0; tcap_t tc = 0; @@ -67,8 +66,11 @@ capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int if (idx < 1) return 0; - thd = capmgr_aep_create_cserialized(&tid, &tcrcvret, idx, owntc, key); - if (!thd) return 0; + thdtidret = capmgr_aep_create_cserialized(dcb, &tcrcvret, idx, owntc, key); + if (!thdtidret) return 0; + thd = thdtidret >> 16; + tid = (thdtidret << 16) >> 16; + if (!thd || !tid) return 0; aep->fn = fn; aep->data = data; @@ -81,24 +83,28 @@ capmgr_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int } thdcap_t -capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *aep, thdclosure_index_t idx, int owntc, cos_channelkey_t key, arcvcap_t *extrcv) +capmgr_aep_create_ext(spdid_t child, struct cos_aep_info *aep, thdclosure_index_t idx, int owntc, cos_channelkey_t key, struct cos_dcb_info **dcb, arcvcap_t *extrcv) { - u32_t drcvtidret = 0; + u32_t thdtidret = 0; u32_t tcrcvret = 0; thdid_t tid = 0; thdcap_t thd = 0; u32_t owntc_aepkey = (owntc << 16) | (key); - thd = capmgr_aep_create_ext_cserialized(&drcvtidret, &tcrcvret, child, idx, owntc_aepkey); - if (!thd) return thd; + thdtidret = capmgr_aep_create_ext_cserialized(dcb, &tcrcvret, child, idx, owntc_aepkey); + if (!thdtidret) return thd; + thd = thdtidret >> 16; + tid = (thdtidret << 16) >> 16; + if (!thd || !tid) return 0; aep->fn = NULL; aep->data = NULL; aep->thd = thd; - aep->tid = (drcvtidret << 16) >> 16; + aep->tid = tid; aep->rcv = tcrcvret >> 16; aep->tc = (tcrcvret << 16) >> 16; - *extrcv = drcvtidret >> 16; + *extrcv = capmgr_aep_rcv_retrieve_cserialized(child, tid); + assert(*extrcv); return aep->thd; } diff --git a/src/components/interface/capmgr/stubs/s_stub.S b/src/components/interface/capmgr/stubs/s_stub.S index fe4922d164..5719a4734c 100644 --- a/src/components/interface/capmgr/stubs/s_stub.S +++ b/src/components/interface/capmgr/stubs/s_stub.S @@ -7,6 +7,7 @@ cos_asm_server_stub_rets(capmgr_thd_create_cserialized) 
cos_asm_server_stub_rets(capmgr_aep_create_cserialized) cos_asm_server_stub_rets(capmgr_thd_create_ext_cserialized) cos_asm_server_stub_rets(capmgr_aep_create_ext_cserialized) +cos_asm_server_stub(capmgr_aep_rcv_retrieve_cserialized) cos_asm_server_stub_rets(capmgr_thd_retrieve_cserialized) cos_asm_server_stub_rets(capmgr_thd_retrieve_next_cserialized) cos_asm_server_stub(capmgr_asnd_create) @@ -14,7 +15,5 @@ cos_asm_server_stub(capmgr_asnd_rcv_create) cos_asm_server_stub(capmgr_asnd_key_create) cos_asm_server_stub(memmgr_heap_page_allocn) -cos_asm_server_stub(memmgr_initdcbpage_retrieve) -cos_asm_server_stub(memmgr_dcbpage_allocn) cos_asm_server_stub_rets(memmgr_shared_page_allocn_cserialized) cos_asm_server_stub_rets(memmgr_shared_page_map_cserialized) diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index db74ae9470..85903b5e82 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcbraw.o cos_dcbcapmgr.o +LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o LIBS=$(LIB_OBJS:%.o=%.a) MANDITORY=c_stub.o cos_asm_upcall.o cos_asm_ainv.o cos_component.o MAND=$(MANDITORY_LIB) diff --git a/src/components/lib/cos_dcb.c b/src/components/lib/cos_dcb.c new file mode 100644 index 0000000000..3924c21b38 --- /dev/null +++ b/src/components/lib/cos_dcb.c @@ -0,0 +1,92 @@ +#include +#include +#include + +static struct cos_dcbinfo_data _cos_dcbinfo[NUM_CPU]; + +void +cos_dcb_info_init_ext(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci, dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t start_off) +{ + memset(cdi, 0, sizeof(struct cos_dcbinfo_data)); + + cdi->dcbcaps[0] = initdcbcap; + cdi->dcbaddr[0] = (vaddr_t)cos_init_dcb_get(); + cdi->curr_cap_off = start_off; + cdi->curr_cap = 0; +} + +void +cos_dcb_info_init(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci) +{ + if (cos_spd_id() == 0) { + cos_dcb_info_init_ext(cdi, ci, LLBOOT_CAPTBL_CPU_INITDCB, (vaddr_t)cos_init_dcb_get(), 1); + } else { + cos_dcb_info_init_ext(cdi, ci, 0, 0, 0); + } +} + +void +cos_dcb_info_init_curr(void) +{ + cos_dcb_info_init_curr_ext(0, 0, 0); +} + +void +cos_dcb_info_init_curr_ext(dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t st_off) +{ + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + + if (initdcbcap == 0 && initdcbaddr == 0) { + + if (cos_spd_id() == 0) { + cos_dcb_info_init_ext(&_cos_dcbinfo[cos_cpuid()], ci, LLBOOT_CAPTBL_CPU_INITDCB, (vaddr_t)cos_init_dcb_get(), 1); + + return; + } else { + initdcbaddr = cos_page_bump_intern_valloc(ci, PAGE_SIZE); + assert(initdcbaddr); + initdcbcap = cos_dcb_alloc(ci, ci->pgtbl_cap, initdcbaddr); + assert(initdcbcap); + st_off = 0; + } + } + cos_dcb_info_init_ext(&_cos_dcbinfo[cos_cpuid()], ci, initdcbcap, initdcbaddr, st_off); +} + +dcbcap_t +cos_dcb_info_alloc_curr(dcboff_t *dcboff, vaddr_t *dcbaddr) +{ + return cos_dcb_info_alloc(&_cos_dcbinfo[cos_cpuid()], dcboff, dcbaddr); +} + +dcbcap_t +cos_dcb_info_alloc(struct cos_dcbinfo_data *cdi, dcboff_t *dcboff, vaddr_t *dcbaddr) +{ + if (unlikely(cdi->dcbcaps[cdi->curr_cap] == 0)) { + *dcboff = 0; + *dcbaddr = 0; + + return 0; + } + if (cdi->curr_cap_off >= COS_DCB_PERPG_MAX) { + int ret; + unsigned short curr_off = cdi->curr_cap; + + assert(curr_off + 1 < (unsigned short)COS_DCB_MAX_CAPS && cdi->dcbcaps[curr_off + 1] == 0); + + cdi->dcbaddr[curr_off + 1] = cos_page_bump_intern_valloc(cdi->ci, PAGE_SIZE); + 
assert(cdi->dcbaddr[curr_off + 1]); + cdi->dcbcaps[curr_off + 1] = cos_dcb_alloc(cos_compinfo_get(cos_defcompinfo_curr_get()), cdi->ci->pgtbl_cap, cdi->dcbaddr[curr_off + 1]); + + assert(cdi->dcbcaps[curr_off + 1]); + ret = ps_cas((unsigned long *)&cdi->curr_cap, curr_off, curr_off + 1); + assert(ret); + ret = ps_cas((unsigned long *)&cdi->curr_cap_off, cdi->curr_cap_off, 0); + assert(ret); + } + + *dcboff = ps_faa((unsigned long *)&cdi->curr_cap_off, 1); + *dcbaddr = cdi->dcbaddr[cdi->curr_cap] + (sizeof(struct cos_dcb_info) * (*dcboff)); + + return cdi->dcbcaps[cdi->curr_cap]; +} diff --git a/src/components/lib/cos_dcbcapmgr.c b/src/components/lib/cos_dcbcapmgr.c deleted file mode 100644 index 74db909b35..0000000000 --- a/src/components/lib/cos_dcbcapmgr.c +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include -#include <../interface/capmgr/memmgr.h> - -static unsigned long free_off[NUM_CPU] CACHE_ALIGNED = { 0 }; -static struct cos_dcb_info *dcb_off[NUM_CPU] CACHE_ALIGNED = { NULL }, *initdcb[NUM_CPU] CACHE_ALIGNED = { NULL }; - -void -cos_dcb_info_init(void) -{ - dcb_off[cos_cpuid()] = initdcb[cos_cpuid()] = (struct cos_dcb_info *)memmgr_initdcbpage_retrieve(); - assert(initdcb[cos_cpuid()]); - - dcb_off[cos_cpuid()]++; - free_off[cos_cpuid()] = 1; -} - -void -cos_dcb_info_alloc(void) -{ - dcb_off[cos_cpuid()] = (struct cos_dcb_info *)memmgr_dcbpage_allocn(1); - assert(dcb_off[cos_cpuid()]); - - free_off[cos_cpuid()] = 0; -} - -struct cos_dcb_info * -cos_dcb_info_assign(void) -{ - unsigned long curr_off = 0; - - curr_off = ps_faa(&free_off[cos_cpuid()], 1); - if (curr_off >= COS_DCB_PERPG_MAX) { - cos_dcb_info_alloc(); - curr_off = ps_faa(&free_off[cos_cpuid()], 1); - } - - return (dcb_off[cos_cpuid()] + curr_off); -} - -struct cos_dcb_info * -cos_dcb_info_init_get(void) -{ - return initdcb[cos_cpuid()]; -} diff --git a/src/components/lib/cos_dcbraw.c b/src/components/lib/cos_dcbraw.c deleted file mode 100644 index 72f08f9294..0000000000 --- a/src/components/lib/cos_dcbraw.c +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include -#include - -static unsigned long free_off[NUM_CPU] CACHE_ALIGNED = { 0 }; -static struct cos_dcb_info *dcb_off[NUM_CPU] CACHE_ALIGNED = { NULL }, *initdcb[NUM_CPU] CACHE_ALIGNED = { NULL }; - -void -cos_dcb_info_init(void) -{ - dcb_off[cos_cpuid()] = initdcb[cos_cpuid()] = cos_init_dcb_get(); - assert(initdcb[cos_cpuid()]); - - dcb_off[cos_cpuid()]++; - free_off[cos_cpuid()] = 1; -} - -void -cos_dcb_info_alloc(void) -{ - struct cos_compinfo *ci_res = cos_compinfo_get(cos_defcompinfo_curr_get()); - - dcb_off[cos_cpuid()] = cos_dcbpg_bump_allocn(ci_res, PAGE_SIZE); - assert(dcb_off[cos_cpuid()]); - - free_off[cos_cpuid()] = 0; -} - -struct cos_dcb_info * -cos_dcb_info_assign(void) -{ - unsigned long curr_off = 0; - - curr_off = ps_faa(&free_off[cos_cpuid()], 1); - if (curr_off >= COS_DCB_PERPG_MAX) { - cos_dcb_info_alloc(); - curr_off = ps_faa(&free_off[cos_cpuid()], 1); - } - - return (dcb_off[cos_cpuid()] + curr_off); -} - -struct cos_dcb_info * -cos_dcb_info_init_get(void) -{ - return initdcb[cos_cpuid()]; -} diff --git a/src/components/lib/cos_defkernel_api.c b/src/components/lib/cos_defkernel_api.c index 97c8b364f8..ceed2c2fbf 100644 --- a/src/components/lib/cos_defkernel_api.c +++ b/src/components/lib/cos_defkernel_api.c @@ -42,20 +42,31 @@ cos_defcompinfo_init(void) cos_defcompinfo_init_ext(BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, - BOOT_CAPTBL_SELF_COMP, 
BOOT_CAPTBL_SELF_SCB, (vaddr_t)cos_scb_info_get(), (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE); + BOOT_CAPTBL_SELF_COMP, (vaddr_t)cos_get_heap_ptr(), BOOT_CAPTBL_FREE); + +} + +void +cos_defcompinfo_llinit(void) +{ + if (curr_defci_init_status == INITIALIZED) return; + + cos_defcompinfo_init_ext(BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, + BOOT_CAPTBL_SELF_COMP, (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE); } void cos_defcompinfo_init_ext(tcap_t sched_tc, thdcap_t sched_thd, arcvcap_t sched_rcv, pgtblcap_t pgtbl_cap, - captblcap_t captbl_cap, compcap_t comp_cap, scbcap_t scb_cap, vaddr_t scb_ptr, vaddr_t heap_ptr, capid_t cap_frontier) + captblcap_t captbl_cap, compcap_t comp_cap, vaddr_t heap_ptr, capid_t cap_frontier) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); if (curr_defci_init_status == INITIALIZED) return; - cos_compinfo_init(ci, pgtbl_cap, captbl_cap, comp_cap, scb_cap, scb_ptr, heap_ptr, cap_frontier, ci); + cos_compinfo_init(ci, pgtbl_cap, captbl_cap, comp_cap, heap_ptr, cap_frontier, ci); curr_defci_init_status = INITIALIZED; cos_defcompinfo_sched_init_ext(sched_tc, sched_thd, sched_rcv); } @@ -87,7 +98,7 @@ cos_defcompinfo_sched_init(void) } static int -cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, cos_aepthd_fn_t fn, void *data, thdclosure_index_t idx, vaddr_t dcbuaddr) +cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, cos_aepthd_fn_t fn, void *data, thdclosure_index_t idx, dcbcap_t dcbcap, dcboff_t dcboff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); @@ -97,9 +108,9 @@ cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, assert(curr_defci_init_status == INITIALIZED); memset(aep, 0, sizeof(struct cos_aep_info)); - if (is_init) aep->thd = cos_initthd_alloc(ci, dst_ci->comp_cap, dst_ci->pgtbl_cap, dcbuaddr); - else if (idx > 0) aep->thd = cos_thd_alloc_ext(ci, dst_ci->comp_cap, idx, dst_ci->pgtbl_cap, dcbuaddr); - else aep->thd = cos_thd_alloc(ci, dst_ci->comp_cap, cos_aepthd_fn, (void *)aep, dst_ci->pgtbl_cap, dcbuaddr); + if (is_init) aep->thd = cos_initthd_alloc(ci, dst_ci->comp_cap, dcbcap); + else if (idx > 0) aep->thd = cos_thd_alloc_ext(ci, dst_ci->comp_cap, idx, dcbcap, dcboff); + else aep->thd = cos_thd_alloc(ci, dst_ci->comp_cap, cos_aepthd_fn, (void *)aep, dcbcap, dcboff); assert(aep->thd); aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!sched && is_init) return 0; @@ -121,7 +132,7 @@ cos_aep_alloc_intern(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, int cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, vaddr_t heap_ptr, capid_t cap_frontier, - int is_sched, vaddr_t *dcbuaddr) + int is_sched, dcbcap_t *initdcbcap) { int ret; struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); @@ -129,13 +140,22 @@ cos_defcompinfo_child_alloc(struct cos_defcompinfo *child_defci, vaddr_t entry, struct cos_compinfo *ci = cos_compinfo_get(defci); struct cos_compinfo *child_ci = cos_compinfo_get(child_defci); struct cos_aep_info *child_aep = cos_sched_aep_get(child_defci); + vaddr_t dcbaddr = 0; + dcbcap_t dcbcap = 0; + scbcap_t scbcap = 0; + + scbcap = cos_scb_alloc(ci); + assert(scbcap); 
assert(curr_defci_init_status == INITIALIZED); - ret = cos_compinfo_alloc(child_ci, heap_ptr, cap_frontier, entry, ci); + ret = cos_compinfo_alloc(child_ci, scbcap, heap_ptr, cap_frontier, entry, ci); if (ret) return ret; - *dcbuaddr = (vaddr_t)cos_dcbpg_bump_allocn(child_ci, PAGE_SIZE); - assert(*dcbuaddr); - ret = cos_aep_alloc_intern(child_aep, child_defci, 0, is_sched ? sched_aep : NULL, NULL, NULL, 0, *dcbuaddr); + dcbaddr = (vaddr_t)cos_page_bump_intern_valloc(child_ci, PAGE_SIZE); + assert(dcbaddr); + dcbcap = cos_dcb_alloc(ci, child_ci->pgtbl_cap, dcbaddr); + assert(dcbcap); + ret = cos_aep_alloc_intern(child_aep, child_defci, 0, is_sched ? sched_aep : NULL, NULL, NULL, 0, dcbcap, 0); + *initdcbcap = dcbcap; return ret; } @@ -149,29 +169,29 @@ cos_defcompinfo_childid_init(struct cos_defcompinfo *child_defci, spdid_t c) } int -cos_initaep_alloc(struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, int is_sched, vaddr_t dcbuaddr) +cos_initaep_alloc(struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, int is_sched, dcbcap_t dcap) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); struct cos_aep_info *child_aep = cos_sched_aep_get(dst_dci); struct cos_aep_info *sched_use = is_sched ? (sched ? sched : sched_aep) : NULL; - return cos_aep_alloc_intern(child_aep, dst_dci, 0, sched_use, NULL, NULL, 0, dcbuaddr); + return cos_aep_alloc_intern(child_aep, dst_dci, 0, sched_use, NULL, NULL, 0, dcap, 0); } int -cos_initaep_tcap_alloc(struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, vaddr_t dcbuaddr) +cos_initaep_tcap_alloc(struct cos_defcompinfo *dst_dci, tcap_t tc, struct cos_aep_info *sched, dcbcap_t dcap) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); struct cos_aep_info *child_aep = cos_sched_aep_get(dst_dci); struct cos_aep_info *sched_use = sched ? 
sched : sched_aep; - return cos_aep_alloc_intern(child_aep, dst_dci, tc, sched_use, NULL, NULL, 0, dcbuaddr); + return cos_aep_alloc_intern(child_aep, dst_dci, tc, sched_use, NULL, NULL, 0, dcap, 0); } int -cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, thdclosure_index_t idx, vaddr_t dcbuaddr) +cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); @@ -180,11 +200,11 @@ cos_aep_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, str if (!sched) sched_aep = cos_sched_aep_get(dst_dci); else sched_aep = sched; - return cos_aep_alloc_intern(aep, dst_dci, 0, sched_aep, NULL, NULL, idx, dcbuaddr); + return cos_aep_alloc_intern(aep, dst_dci, 0, sched_aep, NULL, NULL, idx, dcap, doff); } int -cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, tcap_t tc, thdclosure_index_t idx, vaddr_t dcbuaddr) +cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci, struct cos_aep_info *sched, tcap_t tc, thdclosure_index_t idx, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); @@ -194,25 +214,25 @@ cos_aep_tcap_alloc_ext(struct cos_aep_info *aep, struct cos_defcompinfo *dst_dci if (!sched) sched_aep = cos_sched_aep_get(dst_dci); else sched_aep = sched; - return cos_aep_alloc_intern(aep, dst_dci, tc, sched_aep, NULL, NULL, idx, dcbuaddr); + return cos_aep_alloc_intern(aep, dst_dci, tc, sched_aep, NULL, NULL, idx, dcap, doff); } int -cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, vaddr_t dcbuaddr) +cos_aep_alloc(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); - return cos_aep_alloc_intern(aep, defci, 0, sched_aep, fn, data, 0, dcbuaddr); + return cos_aep_alloc_intern(aep, defci, 0, sched_aep, fn, data, 0, dcap, doff); } int -cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data, vaddr_t dcbuaddr) +cos_aep_tcap_alloc(struct cos_aep_info *aep, tcap_t tc, cos_aepthd_fn_t fn, void *data, dcbcap_t dcap, dcboff_t doff) { struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_aep_info *sched_aep = cos_sched_aep_get(defci); - return cos_aep_alloc_intern(aep, defci, tc, sched_aep, fn, data, 0, dcbuaddr); + return cos_aep_alloc_intern(aep, defci, tc, sched_aep, fn, data, 0, dcap, doff); } int diff --git a/src/components/lib/cos_kernel_api.c b/src/components/lib/cos_kernel_api.c index 17a61c629f..d3f3ad4626 100644 --- a/src/components/lib/cos_kernel_api.c +++ b/src/components/lib/cos_kernel_api.c @@ -71,7 +71,7 @@ cos_capfrontier_init(struct cos_compinfo *ci, capid_t cap_frontier) void cos_compinfo_init(struct cos_compinfo *ci, pgtblcap_t pgtbl_cap, captblcap_t captbl_cap, compcap_t comp_cap, - scbcap_t scb_cap, vaddr_t scb_vas, vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources) + vaddr_t heap_ptr, capid_t cap_frontier, struct cos_compinfo *ci_resources) { assert(ci && ci_resources); assert(cap_frontier % CAPMAX_ENTRY_SZ == 0); @@ -85,10 +85,6 @@ cos_compinfo_init(struct cos_compinfo *ci, 
pgtblcap_t pgtbl_cap, captblcap_t cap ci->pgtbl_cap = pgtbl_cap; ci->captbl_cap = captbl_cap; ci->comp_cap = comp_cap; - ci->scb_cap = scb_cap; - - assert(!scb_vas || scb_vas + COS_SCB_SIZE <= heap_ptr); - ci->scb_vas = scb_vas; cos_capfrontier_init(ci, cap_frontier); cos_vasfrontier_init(ci, heap_ptr); @@ -583,7 +579,7 @@ __alloc_mem_cap(struct cos_compinfo *ci, cap_t ct, vaddr_t *kmem, capid_t *cap) } static thdcap_t -__cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init_data, pgtblcap_t ptcap, vaddr_t dcbaddr) +__cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init_data, dcbcap_t dc, dcboff_t off) { vaddr_t kmem; capid_t cap; @@ -593,11 +589,12 @@ __cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init assert(ci && comp > 0); if (__alloc_mem_cap(ci, CAP_THD, &kmem, &cap)) return 0; - assert(!(init_data & ~((1 << 12) - 1))); + assert(!(init_data & ~((1 << 16) - 1))); + assert(!(off & ~((1 << 9) - 1))); assert(kmem && (round_to_page(kmem) == kmem)); if (call_cap_op(ci->captbl_cap, CAPTBL_OP_THDACTIVATE, __compinfo_metacap(ci)->mi.pgtbl_cap | (cap << 16), - kmem | init_data, comp << 16 | ptcap, dcbaddr)) + kmem, comp << 16 | dc, off << 16 | init_data)) BUG(); return cap; @@ -606,30 +603,49 @@ __cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t init #include thdcap_t -cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx, pgtblcap_t ptcap, vaddr_t dcbaddr) +cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_index_t idx, dcbcap_t dc, dcboff_t off) { if (idx < 1) return 0; - return __cos_thd_alloc(ci, comp, idx, ptcap, dcbaddr); + return __cos_thd_alloc(ci, comp, idx, dc, off); } thdcap_t -cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, pgtblcap_t ptcap, vaddr_t dcbaddr) +cos_thd_alloc(struct cos_compinfo *ci, compcap_t comp, cos_thd_fn_t fn, void *data, dcbcap_t dc, dcboff_t off) { int idx = cos_thd_init_alloc(fn, data); thdcap_t ret; if (idx < 1) return 0; - ret = __cos_thd_alloc(ci, comp, idx, ptcap, dcbaddr); + ret = __cos_thd_alloc(ci, comp, idx, dc, off); if (!ret) cos_thd_init_free(idx); return ret; } thdcap_t -cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, pgtblcap_t ptcap, vaddr_t dcbaddr) +cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, dcbcap_t dc) +{ + return __cos_thd_alloc(ci, comp, 0, dc, 0); +} + +dcbcap_t +cos_dcb_alloc(struct cos_compinfo *ci, pgtblcap_t ptcap, vaddr_t uaddr) { - return __cos_thd_alloc(ci, comp, 0, ptcap, dcbaddr); + vaddr_t kmem; + capid_t cap; + u32_t lid = livenessid_bump_alloc(); + + printd("cos_dcb_alloc\n"); + + assert(ci); + + if (__alloc_mem_cap(ci, CAP_DCB, &kmem, &cap)) return 0; + assert(kmem && (round_to_page(kmem) == kmem)); + if (call_cap_op(ci->captbl_cap, CAPTBL_OP_DCB_ACTIVATE, cap << 16 | lid, (__compinfo_metacap(ci)->mi.pgtbl_cap) << 16 | ptcap, kmem, uaddr)) + BUG(); + + return cap; } captblcap_t @@ -678,6 +694,7 @@ cos_scb_alloc(struct cos_compinfo *ci) assert(ci && lid); if (__alloc_mem_cap(ci, CAP_SCB, &kmem, &cap)) return 0; + assert(kmem && (round_to_page(kmem) == kmem)); if (call_cap_op(ci->captbl_cap, CAPTBL_OP_SCB_ACTIVATE, cap, __compinfo_metacap(ci)->mi.pgtbl_cap, kmem, lid)) BUG(); @@ -705,13 +722,12 @@ cos_comp_alloc(struct cos_compinfo *ci, captblcap_t ctc, pgtblcap_t ptc, scbcap_ } int -cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, +cos_compinfo_alloc(struct 
cos_compinfo *ci, scbcap_t sc, vaddr_t heap_ptr, capid_t cap_frontier, vaddr_t entry, struct cos_compinfo *ci_resources) { pgtblcap_t ptc; captblcap_t ctc; compcap_t compc; - scbcap_t scbc; vaddr_t scb_vaddr; printd("cos_compinfo_alloc\n"); @@ -720,16 +736,14 @@ cos_compinfo_alloc(struct cos_compinfo *ci, vaddr_t heap_ptr, capid_t cap_fronti assert(ptc); ctc = cos_captbl_alloc(ci_resources); assert(ctc); - scbc = cos_scb_alloc(ci_resources); - assert(scbc); - cos_compinfo_init(ci, ptc, ctc, 0, scbc, 0, heap_ptr, cap_frontier, ci_resources); + cos_compinfo_init(ci, ptc, ctc, 0, heap_ptr, cap_frontier, ci_resources); + /* FIXME: make sure this is right at the start of heap_ptr! */ scb_vaddr = (vaddr_t)__page_bump_valloc(ci, COS_SCB_SIZE); assert(scb_vaddr); - compc = cos_comp_alloc(ci_resources, ctc, ptc, scbc, entry, scb_vaddr); + compc = cos_comp_alloc(ci_resources, ctc, ptc, sc, entry, scb_vaddr); assert(compc); ci->comp_cap = compc; - ci->scb_vas = scb_vaddr; return 0; } diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index 1d06543569..f76f21e455 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -99,10 +99,8 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - dcb = cos_dcb_info_assign(); - /* TODO: use dcb */ - aep->thd = capmgr_thd_create(fn, data, &tid); + aep->thd = capmgr_thd_create(fn, data, &tid, &dcb); if (!aep->thd) goto done; aep->tid = tid; @@ -141,7 +139,7 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn } static struct sl_thd * -sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vaddr_t dcbuaddr) +sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vaddr_t *dcbuaddr) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); @@ -155,7 +153,7 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vad aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - aep->thd = capmgr_thd_create_ext(comp->id, idx, &aep->tid); + aep->thd = capmgr_thd_create_ext(comp->id, idx, &aep->tid, (struct cos_dcb_info **)dcbuaddr); if (!aep->thd) goto done; aep->tc = sl_thd_tcap(sl__globals_cpu()->sched_thd); @@ -178,7 +176,7 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vad } static struct sl_thd * -sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, vaddr_t dcbuaddr, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, vaddr_t *dcbuaddr, arcvcap_t *extrcv) { struct cos_aep_info *aep = NULL; struct sl_thd *t = NULL; @@ -203,7 +201,7 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t if (!aep) goto done; if (prps & SL_THD_PROPERTY_OWN_TCAP) owntc = 1; - capmgr_aep_create_ext(comp->id, aep, idx, owntc, key, extrcv); + capmgr_aep_create_ext(comp->id, aep, idx, owntc, key, (struct cos_dcb_info **)dcbuaddr, extrcv); if (!aep->thd) goto done; t = sl_thd_alloc_init(aep, 0, prps, NULL); @@ -224,10 +222,9 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - dcb = cos_dcb_info_assign(); if (prps & SL_THD_PROPERTY_OWN_TCAP) owntc = 1; - capmgr_aep_create(aep, 
fn, data, owntc, key); + capmgr_aep_create(aep, fn, data, owntc, key, &dcb); if (aep->thd == 0) goto done; t = sl_thd_alloc_init(aep, 0, prps, dcb); @@ -277,7 +274,15 @@ sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched) } struct sl_thd * -sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr) +sl_thd_initaep_alloc_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, dcbcap_t dcap) +{ + PRINTC("UNIMPLEMENTED: Using CAPMGR API which should manage the DCB capabilities\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, vaddr_t *dcbuaddr) { struct sl_thd *t = NULL; @@ -295,8 +300,17 @@ sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int return t; } + +struct sl_thd * +sl_thd_aep_alloc_ext_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, arcvcap_t *extrcv) +{ + PRINTC("UNIMPLEMENTED: Using CAPMGR API which should manage the DCB capabilities\n"); + + return NULL; +} + struct sl_thd * -sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, vaddr_t *dcbuaddr, arcvcap_t *extrcv) { struct sl_thd *t = NULL; diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index 378a9c7969..dcb7919449 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -87,12 +87,15 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; struct cos_dcb_info *dcb = NULL; + dcbcap_t dcap; + dcboff_t doff; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - dcb = cos_dcb_info_assign(); + dcap = cos_dcb_info_alloc_curr(&doff, (vaddr_t *)&dcb); + assert(dcap); - aep->thd = cos_thd_alloc(ci, ci->comp_cap, fn, data, ci->pgtbl_cap, (vaddr_t)dcb); + aep->thd = cos_thd_alloc(ci, ci->comp_cap, fn, data, dcap, doff); if (!aep->thd) goto done; aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!aep->tid) goto done; @@ -131,7 +134,7 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn } static struct sl_thd * -sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vaddr_t dcbuaddr) +sl_thd_alloc_ext_dcb_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, dcbcap_t dcbcap, dcboff_t dcboff) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); @@ -144,7 +147,7 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vad aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - aep->thd = cos_thd_alloc_ext(ci, compci->comp_cap, idx, compci->pgtbl_cap, dcbuaddr); + aep->thd = cos_thd_alloc_ext(ci, compci->comp_cap, idx, dcbcap, dcboff); if (!aep->thd) goto done; aep->tid = cos_introspect(ci, aep->thd, THD_GET_TID); if (!aep->tid) goto done; @@ -153,7 +156,7 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vad sl_mod_thd_create(sl_mod_thd_policy_get(t)); } else { assert(idx == 
0); - ret = cos_initaep_alloc(comp, NULL, 0, dcbuaddr); + ret = cos_initaep_alloc(comp, NULL, 0, dcbcap); if (ret) goto done; t = sl_thd_comp_init_no_cs(comp, 0, 0); @@ -171,15 +174,18 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c struct cos_aep_info *aep = NULL; struct cos_dcb_info *dcb = NULL; int ret; + dcbcap_t dcap; + dcboff_t doff; aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - dcb = cos_dcb_info_assign(); + dcap = cos_dcb_info_alloc_curr(&doff, (vaddr_t *)&dcb); + assert(dcap); /* NOTE: Cannot use stack-allocated cos_aep_info struct here */ - if (prps & SL_THD_PROPERTY_OWN_TCAP) ret = cos_aep_alloc(aep, fn, data, (vaddr_t)dcb); + if (prps & SL_THD_PROPERTY_OWN_TCAP) ret = cos_aep_alloc(aep, fn, data, dcap, doff); else ret = cos_aep_tcap_alloc(aep, sl_thd_aepinfo(sl__globals_cpu()->sched_thd)->tc, - fn, data, (vaddr_t)dcb); + fn, data, dcap, doff); if (ret) goto done; t = sl_thd_alloc_init(aep, 0, prps, dcb); @@ -190,7 +196,7 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c } static struct sl_thd * -sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, vaddr_t dcbuaddr, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext_dcb_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, thdclosure_index_t idx, sl_thd_property_t prps, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, arcvcap_t *extrcv) { struct cos_aep_info *aep = NULL; struct sl_thd *t = NULL; @@ -198,11 +204,11 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t int ret = 0; if (prps & SL_THD_PROPERTY_SEND) { - assert(sched); + assert(sched && !doff); if (prps & SL_THD_PROPERTY_OWN_TCAP) { - ret = cos_initaep_alloc(comp, sl_thd_aepinfo(sched), prps & SL_THD_PROPERTY_SEND, dcbuaddr); + ret = cos_initaep_alloc(comp, sl_thd_aepinfo(sched), prps & SL_THD_PROPERTY_SEND, dcap); } else { - ret = cos_initaep_tcap_alloc(comp, sl_thd_tcap(sched), sl_thd_aepinfo(sched), dcbuaddr); + ret = cos_initaep_tcap_alloc(comp, sl_thd_tcap(sched), sl_thd_aepinfo(sched), dcap); } if (ret) goto done; @@ -214,9 +220,9 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t if (!aep) goto done; if (prps & SL_THD_PROPERTY_OWN_TCAP) { - ret = cos_aep_alloc_ext(aep, comp, sl_thd_aepinfo(sched), idx, dcbuaddr); + ret = cos_aep_alloc_ext(aep, comp, sl_thd_aepinfo(sched), idx, dcap, doff); } else { - ret = cos_aep_tcap_alloc_ext(aep, comp, sl_thd_aepinfo(sched), sl_thd_tcap(sched), idx, dcbuaddr); + ret = cos_aep_tcap_alloc_ext(aep, comp, sl_thd_aepinfo(sched), sl_thd_tcap(sched), idx, dcap, doff); } if (ret) goto done; @@ -270,23 +276,39 @@ sl_thd_comp_init(struct cos_defcompinfo *comp, int is_sched) } struct sl_thd * -sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr) +sl_thd_initaep_alloc(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, vaddr_t *dcbaddr) +{ + PRINTC("UNIMPLEMENTED: Using RAW API which cannot manage DCB resource for child components\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_initaep_alloc_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, int is_sched, int own_tcap, cos_channelkey_t key, dcbcap_t dcap) { struct sl_thd *t = NULL; if (!comp) return NULL; sl_cs_enter(); - if (!is_sched) t = sl_thd_alloc_ext_no_cs(comp, 0, dcbuaddr); - else t = 
sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, 0, (is_sched ? SL_THD_PROPERTY_SEND : 0) - | (own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0), key, dcbuaddr, NULL); + if (!is_sched) t = sl_thd_alloc_ext_dcb_no_cs(comp, 0, dcap, 0); + else t = sl_thd_aep_alloc_ext_dcb_no_cs(comp, sched_thd, 0, (is_sched ? SL_THD_PROPERTY_SEND : 0) + | (own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0), key, dcap, 0, NULL); sl_cs_exit(); return t; } struct sl_thd * -sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, vaddr_t dcbuaddr, arcvcap_t *extrcv) +sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, vaddr_t *dcbaddr, arcvcap_t *extrcv) +{ + PRINTC("UNIMPLEMENTED: Using RAW API which cannot manage DCB resource for child components\n"); + + return NULL; +} + +struct sl_thd * +sl_thd_aep_alloc_ext_dcb(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thdclosure_index_t idx, int is_aep, int own_tcap, cos_channelkey_t key, dcbcap_t dcap, dcboff_t doff, arcvcap_t *extrcv) { struct sl_thd *t = NULL; @@ -294,9 +316,9 @@ sl_thd_aep_alloc_ext(struct cos_defcompinfo *comp, struct sl_thd *sched_thd, thd sl_cs_enter(); if (!is_aep) own_tcap = 0; if (is_aep) { - t = sl_thd_aep_alloc_ext_no_cs(comp, sched_thd, idx, own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0, key, dcbuaddr, extrcv); + t = sl_thd_aep_alloc_ext_dcb_no_cs(comp, sched_thd, idx, own_tcap ? SL_THD_PROPERTY_OWN_TCAP : 0, key, dcap, doff, extrcv); } else { - t = sl_thd_alloc_ext_no_cs(comp, idx, dcbuaddr); + t = sl_thd_alloc_ext_dcb_no_cs(comp, idx, dcap, doff); } sl_cs_exit(); diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 48ad45f205..769702bbce 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -562,6 +562,7 @@ sl_init(microsec_t period) struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); struct sl_global_cpu *g = sl__globals_cpu(); + struct cos_aep_info *ga = cos_sched_aep_get(dci); u32_t cpu_bmp[(NUM_CPU + 7)/8] = { 0 }; /* TODO! pass from the user! */ if (ps_cas(&first, 1, 0)) { @@ -586,12 +587,11 @@ sl_init(microsec_t period) sl_timeout_init(period); /* Create the scheduler thread for us. 
cos_sched_aep_get() is from global(static) memory */ - cos_dcb_info_init(); g->sched_thd = sl_thd_alloc_init(cos_sched_aep_get(dci), 0, 0, (struct cos_dcb_info *)cos_init_dcb_get()); assert(g->sched_thd); - g->sched_thdcap = BOOT_CAPTBL_SELF_INITTHD_CPU_BASE; - g->sched_tcap = BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE; - g->sched_rcv = BOOT_CAPTBL_SELF_INITRCV_CPU_BASE; + g->sched_thdcap = ga->thd; + g->sched_tcap = ga->tc; + g->sched_rcv = ga->rcv; g->sched_thd->prio = 0; ps_list_head_init(&g->event_head); assert(cos_thdid() == sl_thd_thdid(g->sched_thd)); diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 73fda151e5..9f7449f0e8 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -16,7 +16,7 @@ #include "include/chal/defs.h" #include "include/hw.h" #include "include/scb.h" -//#include "include/dcb.h" +#include "include/dcb.h" #define COS_DEFAULT_RET_CAP 0 @@ -119,7 +119,7 @@ cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, int scb_core->curr_thd = 0; ulthd = ch_ult->t; - assert(ulthd->dcbinfo); + if (unlikely(ulthd->dcbinfo = NULL)) goto done; if (ulthd == thd) goto done; /* TODO: check if the threads are running in the same component.. */ @@ -368,6 +368,8 @@ cap_cpy(struct captbl *t, capid_t cap_to, capid_t capin_to, capid_t cap_from, ca type = ctfrom->type; sz = __captbl_cap2bytes(type); + /* don't allow cap copy on SCB/DCB */ + if (type == CAP_SCB || type == CAP_DCB) return -EINVAL; ctto = __cap_capactivate_pre(t, cap_to, capin_to, type, &ret); if (!ctto) return -EINVAL; @@ -1234,32 +1236,25 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * break; } case CAPTBL_OP_THDACTIVATE: { - u32_t reg2 = __userregs_get2(regs); u32_t reg3 = __userregs_get3(regs); u32_t reg4 = __userregs_get4(regs); - thdclosure_index_t init_data = (reg2) & (~(~0 << 12)); - capid_t pgtbl_addr = (reg2) & (~0 << 12); + capid_t pgtbl_addr = __userregs_get2(regs); + thdclosure_index_t init_data = (reg4 << 16) >> 16; capid_t thd_cap = (capin >> 16); capid_t pgtbl_cap = (capin << 16) >> 16; capid_t compcap = (reg3 >> 16); - capid_t dcbpgtbl_cap = (reg3 << 16) >> 16; - vaddr_t dcbuaddr = reg4, dcbkaddr; - unsigned long *tpte = NULL, *dcbpte = NULL, flags; - struct thread *thd; + capid_t dcb_cap = (reg3 << 16) >> 16; + unsigned short dcboff = reg4 >> 16; + unsigned long *tpte = NULL, flags; + struct thread *thd; struct cap_header *ctfrom; ret = cap_kmem_activate(ct, pgtbl_cap, pgtbl_addr, (unsigned long *)&thd, &tpte); if (unlikely(ret)) cos_throw(err, ret); assert(thd && tpte); - ctfrom = captbl_lkup(ct, dcbpgtbl_cap); - if (unlikely(!ctfrom || ctfrom->type != CAP_PGTBL)) return -EINVAL; - dcbpte = pgtbl_lkup(((struct cap_pgtbl *)ctfrom)->pgtbl, (dcbuaddr & (~0 << 12)), (u32_t *)&flags); - if (!dcbpte) return -EINVAL; - dcbkaddr = ((unsigned long)dcbpte & (~0 << 12)) | (dcbuaddr & ~(~0 << 12)); - /* ret is returned by the overall function */ - ret = thd_activate(ct, cap, thd_cap, thd, compcap, init_data, dcbkaddr); + ret = thd_activate(ct, cap, thd_cap, thd, compcap, init_data, dcb_cap, dcboff); if (ret) kmem_unalloc(tpte); break; @@ -1282,7 +1277,7 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * case CAPTBL_OP_THDDEACTIVATE: { livenessid_t lid = __userregs_get2(regs); - ret = thd_deactivate(ct, op_cap, capin, lid, 0, 0, 0); + ret = thd_deactivate(ct, op_cap, capin, lid, 0, 0, 0, 0); break; } case CAPTBL_OP_THDTLSSET: { @@ -1298,7 +1293,7 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct 
pt_regs * capid_t pgtbl_cap = __userregs_get3(regs); capid_t cosframe_addr = __userregs_get4(regs); - ret = thd_deactivate(ct, op_cap, capin, lid, pgtbl_cap, cosframe_addr, 1); + ret = thd_deactivate(ct, op_cap, capin, lid, pgtbl_cap, cosframe_addr, 0, 1); break; } case CAPTBL_OP_CAPKMEM_FREEZE: { @@ -1310,9 +1305,9 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * case CAPTBL_OP_COMPACTIVATE: { capid_t captbl_cap = __userregs_get2(regs) >> 16; capid_t pgtbl_cap = __userregs_get2(regs) & 0xFFFF; - livenessid_t lid = (capin >> 16); + livenessid_t lid = capin >> 16; capid_t comp_cap = (capin << 16) >> 16; - vaddr_t scb_uaddr = __userregs_get3(regs) | ~((1 << 12) - 1); + vaddr_t scb_uaddr = __userregs_get3(regs) & (~0 << 12); vaddr_t entry_addr = __userregs_get4(regs); capid_t scb_cap = __userregs_get3(regs) & ((1 << 12) - 1); @@ -1427,8 +1422,8 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * } case CAPTBL_OP_SCB_ACTIVATE: { capid_t ptcap = __userregs_get2(regs); - livenessid_t lid = __userregs_get3(regs); - vaddr_t addr = __userregs_get4(regs); + livenessid_t lid = __userregs_get4(regs); + vaddr_t addr = __userregs_get3(regs); unsigned long *pte; struct cos_scb_info *scb; @@ -1449,36 +1444,41 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * break; } -// case CAPTBL_OP_DCB_ACTIVATE: { -// u32_t r1 = __userregs_get1(regs); -// u32_t r2 = __userregs_get2(regs); -// u32_t r3 = __userregs_get3(regs); -// u32_t r4 = __userregs_get4(regs); -// capid_t dcbcap = r1 >> 16; -// capid_t ptcap = (r1 << 16) >> 16; -// livenessid_t lid = r2 >> 16; -// capid_t ptcapin = (r2 << 16) >> 16; -// vaddr_t kaddr = r3; -// vaddr_t uaddrin = r4; -// -// ret = dcb_activate(ct, cap, dcbcap, ptcap, kaddr, lid, ptcapin, uaddr); -// -// break; -// } -// case CAPTBL_OP_DCB_DEACTIVATE: { -// u32_t r2 = __userregs_get2(regs); -// u32_t r3 = __userregs_get3(regs); -// u32_t r4 = __userregs_get4(regs); -// livenessid_t lid = r2 >> 16; -// capid_t ptcap = (r2 << 16) >> 16; -// vaddr_t cf_addr = r3 & (~0 << 12); -// vaddr_t uaddrin = r4 & (~0 << 12); -// capid_t ptcapin = (r4 << 20) >> 12 | ((r3 << 20) >> 20); -// -// ret = dcb_deactivate(ct, capin, lid, ptcap, cf_addr, ptcapin, uaddrin); -// -// break; -// } + case CAPTBL_OP_DCB_ACTIVATE: { + u32_t r1 = __userregs_get1(regs); + u32_t r2 = __userregs_get2(regs); + u32_t r3 = __userregs_get3(regs); + u32_t r4 = __userregs_get4(regs); + capid_t dcbcap = r1 >> 16; + capid_t ptcap = r2 >> 16; + livenessid_t lid = (r1 << 16) >> 16; + capid_t ptcapin = (r2 << 16) >> 16; + vaddr_t kaddr = r3; + vaddr_t uaddrin = r4; + struct cos_dcb_info *dcb; + unsigned long *pte; + + ret = cap_kmem_activate(ct, ptcap, kaddr, (unsigned long *)&dcb, &pte); + if (ret) cos_throw(err, ret); + + ret = dcb_activate(ct, cap, dcbcap, (vaddr_t)dcb, lid, ptcapin, uaddrin); + + break; + } + case CAPTBL_OP_DCB_DEACTIVATE: { + u32_t r2 = __userregs_get2(regs); + u32_t r3 = __userregs_get3(regs); + u32_t r4 = __userregs_get4(regs); + livenessid_t lid = r2 >> 16; + capid_t ptcap = (r2 << 16) >> 16; + vaddr_t cf_addr = r3 & (~0 << 12); + vaddr_t uaddrin = r4 & (~0 << 12); + capid_t ptcapin = (r4 << 20) >> 12 | ((r3 << 20) >> 20); + + ret = dcb_deactivate(op_cap, capin, lid, ptcap, cf_addr, ptcapin, uaddrin); + + break; + } default: goto err; } diff --git a/src/kernel/include/dcb.h b/src/kernel/include/dcb.h index 3b64c4477e..cd466180dd 100644 --- a/src/kernel/include/dcb.h +++ b/src/kernel/include/dcb.h @@ -11,6 
+11,7 @@ #include "pgtbl.h" #include "retype_tbl.h" #include "component.h" +#include "thd.h" #define DCB_ENTRIES_MAX_PER_PAGE (PAGE_SIZE/sizeof(struct cos_dcb_info)) @@ -23,25 +24,24 @@ struct cap_dcb { } __attribute__((packed)); static int -dcb_activate(struct captbl *t, capid_t ctcap, capid_t dcbcap, capid_t ptcap, vaddr_t kaddr, livenessid_t lid, capid_t ptcapin, vaddr_t uaddr) +dcb_activate(struct captbl *t, capid_t ctcap, capid_t dcbcap, vaddr_t kaddr, livenessid_t lid, capid_t ptcapin, vaddr_t uaddr) { struct cap_dcb *dc; - struct cap_pgtbl *ptc; - unsigned long *tpte; - struct cos_dcb_info *di; + struct cap_pgtbl *ptcin; int ret; + paddr_t pf = chal_va2pa((void *)kaddr); - ret = cap_kmem_activate(t, ptcap, kaddr, (unsigned long *)&di, &tpte); - if (unlikely(ret)) return -EINVAL; - assert(di && tpte); + ptcin = (struct cap_pgtbl *)captbl_lkup(t, ptcapin); + if (!ptcin || ptcin->h.type != CAP_PGTBL) return -EINVAL; - /* TODO: memactivate kaddr -> uaddr in ptcapin */ + if (pgtbl_mapping_add(ptcin->pgtbl, uaddr, pf, PGTBL_USER_DEF)) return -EINVAL; dc = (struct cap_dcb *)__cap_capactivate_pre(t, ctcap, dcbcap, CAP_DCB, &ret); if (!dc) return -EINVAL; ltbl_get(lid, &dc->liveness); - dc->kern_addr = (vaddr_t)di; + dc->kern_addr = kaddr; + memset((void *)kaddr, 0, PAGE_SIZE); dc->refcnt = 0; dc->cpuid = get_cpuid(); @@ -54,16 +54,24 @@ static int dcb_deactivate(struct cap_captbl *ct, capid_t dcbcap, livenessid_t lid, capid_t ptcap, capid_t cosframe_addr, capid_t ptcapin, vaddr_t uaddrin) { struct cap_dcb *dc; + struct cap_pgtbl *ptcin; + unsigned long *pte, addr, flags, old_v; int ret; - dc = (struct cap_comp *)captbl_lkup(ct->captbl, dcbcap); - if (dc->h.type != CAP_DCB) return -EINVAL; + dc = (struct cap_dcb *)captbl_lkup(ct->captbl, dcbcap); + if (!dc || dc->h.type != CAP_DCB) return -EINVAL; + + if (!ptcapin || !uaddrin) return -EINVAL; + ptcin = (struct cap_pgtbl *)captbl_lkup(ct->captbl, ptcapin); + if (!ptcin || ptcin->h.type != CAP_PGTBL) return -EINVAL; + pte = pgtbl_lkup(ptcin->pgtbl, uaddrin, (u32_t *)&flags); + if (!pte) return -EINVAL; + if ((vaddr_t)pte != dc->kern_addr) return -EINVAL; if (dc->refcnt) return -EPERM; - /* TODO: verify uaddrin in ptcapin maps to kaddr for this dcb and then unmap from ptcapin at uaddrin */ ltbl_expire(&dc->liveness); - ret = kmem_deact_pre(dc, ct, ptcap, cosframe_addr, &pte, &old_v); + ret = kmem_deact_pre((struct cap_header *)dc, ct->captbl, ptcap, cosframe_addr, &pte, &old_v); if (ret) return ret; ret = kmem_deact_post(pte, old_v); if (ret) return ret; @@ -89,9 +97,9 @@ dcb_thd_deref(struct cap_dcb *dc, struct thread *thd) { if (!dc->refcnt) return -EINVAL; if (dc->cpuid != thd->cpuid) return -EINVAL; - if (!ltbl_isalive(&dc->liveness)) return -EPERM; - assert((vaddr_t)thd->dcbinfo >= dc->kern_addr && (vaddr_t)thd->dcbinfo < (dc->kern_addr + PAGE_SIZE)); + if ((vaddr_t)thd->dcbinfo < dc->kern_addr || (vaddr_t)thd->dcbinfo > (dc->kern_addr + PAGE_SIZE)) return -EINVAL; + if (!ltbl_isalive(&dc->liveness)) return -EPERM; dc->refcnt--; diff --git a/src/kernel/include/scb.h b/src/kernel/include/scb.h index b3618bed12..ca80a7036a 100644 --- a/src/kernel/include/scb.h +++ b/src/kernel/include/scb.h @@ -48,7 +48,7 @@ scb_deactivate(struct cap_captbl *ct, capid_t scbcap, capid_t ptcap, capid_t cos int ret; sc = (struct cap_scb *)captbl_lkup(ct->captbl, scbcap); - if (sc->h.type != CAP_SCB) return -EINVAL; + if (!sc || sc->h.type != CAP_SCB) return -EINVAL; /* FIXME: component using this scbcap is still active! how to handle this? 
*/ if (sc->compc) return -EPERM; diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index ac86706d2a..0a47ee09de 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -267,8 +267,7 @@ enum BOOT_CAPTBL_KM_PTE = 18, BOOT_CAPTBL_SINV_CAP = 20, - BOOT_CAPTBL_SELF_SCB = 24, /* FIXME: Do we need this? */ - BOOT_CAPTBL_SELF_INITHW_BASE = 26, + BOOT_CAPTBL_SELF_INITHW_BASE = 24, BOOT_CAPTBL_SELF_INITTHD_BASE = 28, /* * NOTE: kernel doesn't support sharing a cache-line across cores, @@ -293,7 +292,16 @@ enum #define BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE + cpuid * CAP64B_IDSZ) #define BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) + CAP16B_IDSZ) #define BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITRCV_BASE + cpuid * CAP64B_IDSZ) -#define BOOT_CAPTBL_SELF_INITDCB_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) + CAP32B_IDSZ) + +enum llboot_scb_dcb_caps +{ + LLBOOT_CAPTBL_SCB = round_up_to_pow2(BOOT_CAPTBL_LAST_CAP, CAPMAX_ENTRY_SZ), + LLBOOT_CAPTBL_INITDCB = LLBOOT_CAPTBL_SCB + CAP32B_IDSZ, + LLBOOT_CAPTBL_FREE = round_up_to_pow2(LLBOOT_CAPTBL_INITDCB + (CAP32B_IDSZ * NUM_CPU), CAPMAX_ENTRY_SZ), +}; + +#define LLBOOT_CAPTBL_INITDCB_CPU(cpuid) (LLBOOT_CAPTBL_INITDCB + (CAP32B_IDSZ * cpuid)) +#define LLBOOT_CAPTBL_CPU_INITDCB (LLBOOT_CAPTBL_INITDCB_CPU(cos_cpuid())) /* * The half of the first page of init captbl is devoted to root node. So, the diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index cb269e3f7c..797255dc50 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -90,6 +90,8 @@ struct cap_thd { cpuid_t cpuid; } __attribute__((packed)); +#include "dcb.h" + static void thd_upcall_setup(struct thread *thd, u32_t entry_addr, int option, int arg1, int arg2, int arg3) { @@ -334,16 +336,22 @@ thd_scheduler_set(struct thread *thd, struct thread *sched) } static int -thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, capid_t compcap, thdclosure_index_t init_data, unsigned long dcbkaddr) +thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, capid_t compcap, thdclosure_index_t init_data, capid_t dcbcap, unsigned short dcboff) { struct cos_cpu_local_info *cli = cos_cpu_local_info(); - struct cap_thd *tc; - struct cap_comp *compc; + struct cap_thd *tc = NULL; + struct cap_comp *compc = NULL; + struct cap_dcb *dc = NULL; int ret; memset(thd, 0, sizeof(struct thread)); compc = (struct cap_comp *)captbl_lkup(t, compcap); if (unlikely(!compc || compc->h.type != CAP_COMP)) return -EINVAL; + if (likely(dcbcap)) { + dc = (struct cap_dcb *)captbl_lkup(t, dcbcap); + if (unlikely(!dc || dc->h.type != CAP_DCB)) return -EINVAL; + if (dcboff > PAGE_SIZE / sizeof(struct cos_dcb_info)) return -EINVAL; + } tc = (struct cap_thd *)__cap_capactivate_pre(t, cap, capin, CAP_THD, &ret); if (!tc) return ret; @@ -355,8 +363,12 @@ thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, c thd->refcnt = 1; thd->invstk_top = 0; thd->cpuid = get_cpuid(); - thd->dcbinfo = (struct cos_dcb_info *)dcbkaddr; - memset(thd->dcbinfo, 0, sizeof(struct cos_dcb_info)); + if (likely(dc)) { + ret = dcb_thd_ref(dc, thd); + if (ret) goto err; /* TODO: cleanup captbl slot */ + thd->dcbinfo = (struct cos_dcb_info *)(dc->kern_addr + (dcboff * sizeof(struct cos_dcb_info))); + memset(thd->dcbinfo, 0, sizeof(struct cos_dcb_info)); + } assert(thd->tid <= 
MAX_NUM_THREADS); thd_scheduler_set(thd, thd_current(cli)); @@ -370,18 +382,21 @@ thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, c tc->t = thd; tc->cpuid = get_cpuid(); __cap_capactivate_post(&tc->h, CAP_THD); - /* TODO: dcb_thd_ref() */ return 0; + +err: + return ret; } static int thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capin, livenessid_t lid, capid_t pgtbl_cap, - capid_t cosframe_addr, const int root) + capid_t cosframe_addr, capid_t dcbcap, const int root) { struct cos_cpu_local_info *cli = cos_cpu_local_info(); - struct cap_header * thd_header; - struct thread * thd; + struct cap_header *thd_header; + struct thread *thd; + struct cap_dcb *dcb = NULL; unsigned long old_v = 0, *pte = NULL; int ret; @@ -389,6 +404,10 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi if (!thd_header || thd_header->type != CAP_THD) cos_throw(err, -EINVAL); thd = ((struct cap_thd *)thd_header)->t; assert(thd->refcnt); + if (dcbcap) { + dcb = (struct cap_dcb *)captbl_lkup(ct, dcbcap); + if (!dcb || dcb->h.type != CAP_DCB) cos_throw(err, -EINVAL); + } if (thd->refcnt == 1) { if (!root) cos_throw(err, -EINVAL); @@ -414,6 +433,10 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi } } + if (dcb) { + ret = dcb_thd_deref(dcb, thd); + if (ret) cos_throw(err, ret); + } ret = cap_capdeactivate(dest_ct, capin, CAP_THD, lid); if (ret) cos_throw(err, ret); @@ -427,7 +450,6 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi ret = kmem_deact_post(pte, old_v); if (ret) cos_throw(err, ret); } - /* TODO: dcb_thd_deref() */ return 0; err: @@ -590,7 +612,7 @@ thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) */ } - if (thd->dcbinfo && thd->dcbinfo->sp) { + if (likely(thd->dcbinfo && thd->dcbinfo->sp)) { if (!preempt) { regs->dx = regs->ip = thd->dcbinfo->ip + DCB_IP_KERN_OFF; regs->cx = regs->sp = thd->dcbinfo->sp; diff --git a/src/platform/i386/boot_comp.c b/src/platform/i386/boot_comp.c index 9599051ff5..b023d8e471 100644 --- a/src/platform/i386/boot_comp.c +++ b/src/platform/i386/boot_comp.c @@ -9,11 +9,13 @@ #include #include #include +#include +#include extern u8_t *boot_comp_pgd; -vaddr_t dcb_addr[NUM_CPU]; -void *thd_mem[NUM_CPU], *tcap_mem[NUM_CPU]; +vaddr_t dcb_addr, dcb_uaddr; +void *thd_mem, *tcap_mem; struct captbl *glb_boot_ct; int @@ -24,7 +26,7 @@ boot_nptes(unsigned int sz) int boot_pgtbl_mappings_add(struct captbl *ct, capid_t pgdcap, capid_t ptecap, const char *label, void *kern_vaddr, - unsigned long user_vaddr, unsigned int range, int uvm, unsigned long *scb_uaddr) + unsigned long user_vaddr, unsigned int range, int uvm) { int ret; u8_t * ptes; @@ -87,44 +89,22 @@ boot_pgtbl_mappings_add(struct captbl *ct, capid_t pgdcap, capid_t ptecap, const if (!uvm && pgtbl_cosframe_add(pgtbl, mapat, pf, PGTBL_COSFRAME)) assert(0); assert((void *)p == pgtbl_lkup(pgtbl, user_vaddr + i * PAGE_SIZE, &flags)); } - if (uvm) { - unsigned int j; - u8_t *p; - paddr_t pf; - u32_t mapat = (u32_t)user_vaddr + i * PAGE_SIZE, flags = 0; - - assert(i == range / PAGE_SIZE); - assert(COS_SCB_SIZE == PAGE_SIZE); /* FIXME: for prototype impl! 
*/ - *scb_uaddr = (unsigned long)mapat; - i++; - - for (j = 0; j < NUM_CPU; j++, i++) { - unsigned long *pte = NULL, flags; - mapat = (u32_t)user_vaddr + i * PAGE_SIZE; - p = mem_boot_alloc(1); - assert(p); - pf = chal_va2pa(p); - if (pgtbl_mapping_add(pgtbl, mapat, pf, PGTBL_USER_DEF)) assert(0); - - dcb_addr[j] = (unsigned long)p; - pte = pgtbl_lkup(pgtbl, mapat, (u32_t *)&flags); - assert((void *)p == pte); - } - } return 0; } -/* FIXME: loops to create threads/tcaps/rcv caps per core. */ static void -kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, vaddr_t dcb_addr, const cpuid_t cpu_id) +kern_boot_thd(struct captbl *ct, const cpuid_t cpu_id) { + void *tmem = (void *)((vaddr_t)thd_mem + cpu_id * PAGE_SIZE); + void *tcmem = (void *)((vaddr_t)tcap_mem + cpu_id * PAGE_SIZE); + vaddr_t dcbmem = dcb_addr + cpu_id * PAGE_SIZE, dcbumem = dcb_uaddr + cpu_id * PAGE_SIZE; struct cos_cpu_local_info *cos_info = cos_cpu_local_info(); - struct thread * t = thd_mem; - struct tcap * tc = tcap_mem; + struct thread *t = tmem; + struct tcap *tc = tcmem; tcap_res_t expended; int ret; - struct cap_pgtbl * cap_pt; + struct cap_pgtbl *cap_pt; pgtbl_t pgtbl; assert(cpu_id >= 0); @@ -134,16 +114,18 @@ kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, vaddr_t dcb_addr cos_info->cpuid = cpu_id; cos_info->invstk_top = 0; cos_info->overflow_check = 0xDEADBEEF; - ret = thd_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), thd_mem, BOOT_CAPTBL_SELF_COMP, 0, dcb_addr); + ret = dcb_activate(ct, BOOT_CAPTBL_SELF_CT, LLBOOT_CAPTBL_INITDCB_CPU(cpu_id), dcbmem, 0, BOOT_CAPTBL_SELF_PT, dcbumem); + assert(!ret); + ret = thd_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), tmem, BOOT_CAPTBL_SELF_COMP, 0, LLBOOT_CAPTBL_INITDCB_CPU(cpu_id), 0); assert(!ret); tcap_active_init(cos_info); - ret = tcap_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpu_id), tcap_mem); + ret = tcap_activate(ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpu_id), tcmem); assert(!ret); tc->budget.cycles = TCAP_RES_INF; /* Chronos's got all the time in the world */ tc->perm_prio = 0; - tcap_setprio(tc, 0); /* Chronos gets preempted by no one! */ + tcap_setprio(tc, 0); /* Chronos gets preempted by no one! */ list_enqueue(&cos_info->tcaps, &tc->active_list); /* Chronos on the TCap active list */ cos_info->tcap_uid = 1; cos_info->cycles = tsc(); @@ -157,10 +139,7 @@ kern_boot_thd(struct captbl *ct, void *thd_mem, void *tcap_mem, vaddr_t dcb_addr BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpu_id), BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpu_id), 0, 1); assert(!ret); - /* - * boot component's mapped into SELF_PT, - * switching to boot component's pgd - */ + /* boot component's mapped into SELF_PT, switching to boot component's pgd. 
*/ cap_pt = (struct cap_pgtbl *)captbl_lkup(ct, BOOT_CAPTBL_SELF_PT); if (!cap_pt || !CAP_TYPECHK(cap_pt, CAP_PGTBL)) assert(0); pgtbl = cap_pt->pgtbl; @@ -178,13 +157,13 @@ kern_boot_comp(const cpuid_t cpu_id) u8_t * boot_comp_captbl; pgtbl_t pgtbl = (pgtbl_t)chal_va2pa(&boot_comp_pgd), boot_vm_pgd; u32_t hw_bitmap = 0xFFFFFFFF; - vaddr_t scb_uaddr = 0, scb_kaddr = 0; + vaddr_t scb_uaddr = 0, scb_kaddr = 0; assert(cpu_id >= 0); if (NUM_CPU > 1 && cpu_id > 0) { assert(glb_boot_ct); pgtbl_update(pgtbl); - kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], dcb_addr[cpu_id], cpu_id); + kern_boot_thd(glb_boot_ct, cpu_id); return; } @@ -205,11 +184,13 @@ kern_boot_comp(const cpuid_t cpu_id) assert(!ret); } - for (i = 0; i < NUM_CPU; i++) { - thd_mem[i] = mem_boot_alloc(1); - tcap_mem[i] = mem_boot_alloc(1); - assert(thd_mem[i] && tcap_mem[i]); - } + scb_kaddr = (vaddr_t)mem_boot_alloc(1); + assert(scb_kaddr); + + dcb_addr = (vaddr_t)mem_boot_alloc(NUM_CPU); + thd_mem = mem_boot_alloc(NUM_CPU); + tcap_mem = mem_boot_alloc(NUM_CPU); + assert(thd_mem && tcap_mem && dcb_addr); if (captbl_activate_boot(glb_boot_ct, BOOT_CAPTBL_SELF_CT)) assert(0); if (sret_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SRET)) assert(0); @@ -217,8 +198,6 @@ kern_boot_comp(const cpuid_t cpu_id) hw_asndcap_init(); if (hw_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_INITHW_BASE, hw_bitmap)) assert(0); - scb_kaddr = (vaddr_t)mem_boot_alloc(1); - assert(scb_kaddr); /* * separate pgd for boot component virtual memory */ @@ -230,8 +209,11 @@ kern_boot_comp(const cpuid_t cpu_id) assert(0); ret = boot_pgtbl_mappings_add(glb_boot_ct, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_BOOTVM_PTE, "booter VM", mem_bootc_start(), - (unsigned long)mem_bootc_vaddr(), mem_bootc_end() - mem_bootc_start(), 1, &scb_uaddr); - assert(ret == 0 && scb_uaddr); + (unsigned long)mem_bootc_vaddr(), mem_bootc_end() - mem_bootc_start(), 1); + assert(ret == 0); + scb_uaddr = (vaddr_t)(mem_bootc_vaddr() + (mem_bootc_end() - mem_bootc_start())); + assert(COS_SCB_SIZE == PAGE_SIZE); + dcb_uaddr = scb_uaddr + COS_SCB_SIZE; /* * This _must_ be the last allocation. 
The bump pointer @@ -244,21 +226,22 @@ kern_boot_comp(const cpuid_t cpu_id) nkmemptes = boot_nptes(mem_utmem_end() - mem_boot_end()); ret = boot_pgtbl_mappings_add(glb_boot_ct, BOOT_CAPTBL_SELF_UNTYPED_PT, BOOT_CAPTBL_KM_PTE, "untyped memory", mem_boot_nalloc_end(nkmemptes), BOOT_MEM_KM_BASE, - mem_utmem_end() - mem_boot_nalloc_end(nkmemptes), 0, 0); + mem_utmem_end() - mem_boot_nalloc_end(nkmemptes), 0); assert(ret == 0); /* Shut off further bump allocations */ glb_memlayout.allocs_avail = 0; - if (scb_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_SCB, scb_kaddr, 0)) assert(0); + if (scb_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, LLBOOT_CAPTBL_SCB, scb_kaddr, 0)) assert(0); + printk("\tCapability table and page-table created.\n"); if (comp_activate(glb_boot_ct, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_PT, - BOOT_CAPTBL_SELF_SCB, 0, (vaddr_t)mem_bootc_entry(), scb_uaddr)) + LLBOOT_CAPTBL_SCB, 0, (vaddr_t)mem_bootc_entry(), scb_uaddr)) assert(0); printk("\tCreated boot component structure from page-table and capability-table.\n"); - kern_boot_thd(glb_boot_ct, thd_mem[cpu_id], tcap_mem[cpu_id], dcb_addr[cpu_id], cpu_id); + kern_boot_thd(glb_boot_ct, cpu_id); printk("\tBoot component initialization complete.\n"); } From 80231049c6cfd4606853364ef288277a98056b30 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 16 Mar 2019 22:04:04 -0400 Subject: [PATCH 033/127] bug in restore function --- src/kernel/capinv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 9f7449f0e8..fc2e56ed64 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -119,7 +119,7 @@ cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, int scb_core->curr_thd = 0; ulthd = ch_ult->t; - if (unlikely(ulthd->dcbinfo = NULL)) goto done; + if (unlikely(ulthd->dcbinfo == NULL)) goto done; if (ulthd == thd) goto done; /* TODO: check if the threads are running in the same component.. */ From 230cec9c9b0b367309ae4c12b14c9ae357fc3463 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 16 Mar 2019 22:13:38 -0400 Subject: [PATCH 034/127] adds same component check for current kernel and user threads in lazy update function --- src/kernel/capinv.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index fc2e56ed64..2d1765ab2c 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -97,9 +97,6 @@ cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, int *ci_ptr = thd_invstk_current_compinfo(thd, cos_info, &invstk_top); - /* no user-level thread switches in invocations! */ - /* if (unlikely(invstk_top)) goto done; */ - assert(*ci_ptr && (*ci_ptr)->captbl); if (unlikely(!(*ci_ptr)->scb_data)) goto done; @@ -121,7 +118,8 @@ cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, int ulthd = ch_ult->t; if (unlikely(ulthd->dcbinfo == NULL)) goto done; if (ulthd == thd) goto done; - /* TODO: check if the threads are running in the same component.. 
*/ + /* check if kcurr and ucurr threads are both in the same page-table(component) */ + if (thd_current_pgtbl(ulthd) != thd_current_pgtbl(thd)) goto done; thd_current_update(ulthd, thd, cos_info); thd = ulthd; From 08174b1131657fc4a2ececf1e2b841bd81ea39f9 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 16 Mar 2019 22:16:21 -0400 Subject: [PATCH 035/127] change function name to lazyupdate --- src/kernel/capinv.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 2d1765ab2c..8222ff2a50 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -86,7 +86,7 @@ printfn(struct pt_regs *regs) /* TODO: inline fast path and force non-inlined slow-path */ static inline struct thread * -cap_ulthd_restore(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, int interrupt, struct comp_info **ci_ptr) +cap_ulthd_lazyupdate(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, int interrupt, struct comp_info **ci_ptr) { struct thread *thd = thd_current(cos_info); struct cap_thd *ch_ult; @@ -843,7 +843,7 @@ cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) cos_info = cos_cpu_local_info(); assert(cos_info); - thd = cap_ulthd_restore(regs, cos_info, 1, &ci); + thd = cap_ulthd_lazyupdate(regs, cos_info, 1, &ci); assert(thd && ci && ci->captbl); assert(!(thd->state & THD_STATE_PREEMPTED)); tcap = tcap_current(cos_info); @@ -894,7 +894,7 @@ timer_process(struct pt_regs *regs) cos_info = cos_cpu_local_info(); assert(cos_info); - thd_curr = cap_ulthd_restore(regs, cos_info, 1, &comp); + thd_curr = cap_ulthd_lazyupdate(regs, cos_info, 1, &comp); assert(thd_curr && thd_curr->cpuid == get_cpuid()); assert(comp); @@ -1009,7 +1009,7 @@ composite_syscall_handler(struct pt_regs *regs) int thd_switch = 0; /* Definitely do it for all the fast-path calls. */ - thd = cap_ulthd_restore(regs, cos_info, 0, &ci); + thd = cap_ulthd_lazyupdate(regs, cos_info, 0, &ci); assert(thd); cap = __userregs_getcap(regs); From 28c3f14322f6100b1e8cfadc795ca37595e3dd9f Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 18 Mar 2019 12:50:51 -0400 Subject: [PATCH 036/127] Bugfix in capmgr for copying initaep caps --- src/components/implementation/capmgr/naive/cap_mgr.c | 9 +++++---- .../implementation/no_interface/llbooter/boot_deps.h | 3 ++- src/kernel/include/shared/cos_types.h | 2 -- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/components/implementation/capmgr/naive/cap_mgr.c b/src/components/implementation/capmgr/naive/cap_mgr.c index 5b8a35ba9c..79b01b10b9 100644 --- a/src/components/implementation/capmgr/naive/cap_mgr.c +++ b/src/components/implementation/capmgr/naive/cap_mgr.c @@ -130,6 +130,7 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, spdid_t s, struct cap_comp_info *rs = cap_info_comp_find(s); struct cap_comp_cpu_info *rs_cpu = cap_info_cpu_local(rs); struct cos_compinfo *rs_ci = cap_info_ci(rs); + struct cos_compinfo *rc_ci = cap_info_ci(rc); struct sl_thd *t = NULL, *rinit = NULL; thdcap_t thdcap = 0; dcbcap_t dcbcap = 0; @@ -167,13 +168,13 @@ capmgr_initaep_create_cserialized(u32_t *sndtidret, u32_t *rcvtcret, spdid_t s, } /* parent needs tcap/rcv to manage time. thd/asnd to activate. 
*/ - ret = cos_cap_cpy(rs_ci, cap_ci, CAP_THD, sl_thd_thdcap(t)); + ret = cos_cap_cpy(rc_ci, cap_ci, CAP_THD, sl_thd_thdcap(t)); if (!ret) goto err; - rcv = cos_cap_cpy(rs_ci, cap_ci, CAP_ARCV, sl_thd_rcvcap(t)); + rcv = cos_cap_cpy(rc_ci, cap_ci, CAP_ARCV, sl_thd_rcvcap(t)); if (!rcv) goto err; - tc = cos_cap_cpy(rs_ci, cap_ci, CAP_TCAP, sl_thd_tcap(t)); + tc = cos_cap_cpy(rc_ci, cap_ci, CAP_TCAP, sl_thd_tcap(t)); if (!tc) goto err; - snd = cos_cap_cpy(rs_ci, cap_ci, CAP_ASND, sl_thd_asndcap(t)); + snd = cos_cap_cpy(rc_ci, cap_ci, CAP_ASND, sl_thd_asndcap(t)); if (!snd) goto err; cap_info_thd_init(rc, t, key); diff --git a/src/components/implementation/no_interface/llbooter/boot_deps.h b/src/components/implementation/no_interface/llbooter/boot_deps.h index 4d72beb8b1..9d74c77b17 100644 --- a/src/components/implementation/no_interface/llbooter/boot_deps.h +++ b/src/components/implementation/no_interface/llbooter/boot_deps.h @@ -304,7 +304,8 @@ boot_sched_caps_init(spdid_t spdid) struct cos_aep_info *child_aep = boot_spd_initaep_get(spdid); int ret, i; - if (!capmgr_spdid || capmgr_spdid != spdid) return; + /* booter uses capmgr to create initthds in root-schedulers */ + if (compsi->parent_spdid || (capmgr_spdid && spdid != capmgr_spdid)) return; boot_newcomp_defcinfo_init(spdid); ret = cos_cap_cpy_at(ci, BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, boot_info, child_aep->thd); diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index 0a47ee09de..4193830ebb 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -282,12 +282,10 @@ enum }; #define BOOT_CAPTBL_SELF_INITTCAP_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE + CAP16B_IDSZ) -#define BOOT_CAPTBL_SELF_INITDCB_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE + CAP32B_IDSZ) #define BOOT_CAPTBL_SELF_INITTHD_CPU_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE (BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITRCV_CPU_BASE (BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cos_cpuid())) -#define BOOT_CAPTBL_SELF_INITDCB_CPU_BASE (BOOT_CAPTBL_SELF_INITDCB_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE + cpuid * CAP64B_IDSZ) #define BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) + CAP16B_IDSZ) From 7f40fe05e95868881bf1a2ad940f67c8c35d10f9 Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 18 Mar 2019 15:33:12 -0400 Subject: [PATCH 037/127] Fixed user-level dispatch in root-scheduler component --- .../implementation/capmgr/naive/cap_mgr.c | 4 + .../Makefile | 2 +- .../unit_schedappaep.c} | 0 .../tests/unit_schedappcomp/Makefile | 8 + .../unit_schedappcomp.c} | 0 .../tests/unit_schedcomp/Makefile | 4 +- .../tests/unit_schedcomp/unit_schedlib.c | 160 ++++++++++++++++++ src/components/include/sl.h | 1 + src/components/lib/cos_dcb.c | 2 +- src/components/lib/sl/sl_capmgr.c | 2 + .../i386/runscripts/unit_hierschedcomps.sh | 10 +- .../i386/runscripts/unit_schedappcomps.sh | 5 + .../i386/runscripts/unit_schedcomp.sh | 6 +- 13 files changed, 193 insertions(+), 11 deletions(-) rename src/components/implementation/tests/{unit_schedaep => unit_schedappaep}/Makefile (85%) rename src/components/implementation/tests/{unit_schedaep/unit_schedaep.c => unit_schedappaep/unit_schedappaep.c} (100%) create mode 100644 src/components/implementation/tests/unit_schedappcomp/Makefile rename src/components/implementation/tests/{unit_schedcomp/unit_schedcomp.c => 
unit_schedappcomp/unit_schedappcomp.c} (100%) create mode 100644 src/components/implementation/tests/unit_schedcomp/unit_schedlib.c create mode 100644 src/platform/i386/runscripts/unit_schedappcomps.sh diff --git a/src/components/implementation/capmgr/naive/cap_mgr.c b/src/components/implementation/capmgr/naive/cap_mgr.c index 79b01b10b9..1d6455717a 100644 --- a/src/components/implementation/capmgr/naive/cap_mgr.c +++ b/src/components/implementation/capmgr/naive/cap_mgr.c @@ -31,6 +31,7 @@ capmgr_thd_create_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, thdclosur cap_info_thd_init(r, t, 0); *tid = sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; return thdcap; err: @@ -69,6 +70,7 @@ capmgr_thd_create_ext_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, spdid cap_info_thd_init(rc, t, 0); cap_info_thd_init(rs, t, 0); *tid = sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; /* child is not a scheduler, don't copy into child */ return thdcap; @@ -272,6 +274,7 @@ capmgr_aep_create_ext_cserialized(struct cos_dcb_info **dcb, u32_t *rcvtcret, sp cap_info_thd_init(rc, t, key); cap_info_thd_init(rs, t, 0); thdcap = ret << 16 | sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; return thdcap; err: @@ -325,6 +328,7 @@ capmgr_aep_create_cserialized(struct cos_dcb_info **dcb, u32_t *tcrcvret, thdclo cap_info_thd_init(rc, t, key); *tcrcvret = (tc << 16 | rcv); thdcap = ret << 16 | sl_thd_thdid(t); + *dcb = (struct cos_dcb_info *)dcbaddr; return thdcap; err: diff --git a/src/components/implementation/tests/unit_schedaep/Makefile b/src/components/implementation/tests/unit_schedappaep/Makefile similarity index 85% rename from src/components/implementation/tests/unit_schedaep/Makefile rename to src/components/implementation/tests/unit_schedappaep/Makefile index b6f56f58bf..da9e217045 100644 --- a/src/components/implementation/tests/unit_schedaep/Makefile +++ b/src/components/implementation/tests/unit_schedappaep/Makefile @@ -1,4 +1,4 @@ -COMPONENT=unit_schedaep_test.o +COMPONENT=unit_schedappaep_test.o INTERFACES= DEPENDENCIES=sched capmgr IF_LIB= diff --git a/src/components/implementation/tests/unit_schedaep/unit_schedaep.c b/src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c similarity index 100% rename from src/components/implementation/tests/unit_schedaep/unit_schedaep.c rename to src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c diff --git a/src/components/implementation/tests/unit_schedappcomp/Makefile b/src/components/implementation/tests/unit_schedappcomp/Makefile new file mode 100644 index 0000000000..dfe5cbcf92 --- /dev/null +++ b/src/components/implementation/tests/unit_schedappcomp/Makefile @@ -0,0 +1,8 @@ +COMPONENT=unit_schedappcomp_test.o +INTERFACES= +DEPENDENCIES=sched +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api #required for cos_sinv in llboot.h! 
+ +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c b/src/components/implementation/tests/unit_schedappcomp/unit_schedappcomp.c similarity index 100% rename from src/components/implementation/tests/unit_schedcomp/unit_schedcomp.c rename to src/components/implementation/tests/unit_schedappcomp/unit_schedappcomp.c diff --git a/src/components/implementation/tests/unit_schedcomp/Makefile b/src/components/implementation/tests/unit_schedcomp/Makefile index 3edcf1b36d..1134e9cb60 100644 --- a/src/components/implementation/tests/unit_schedcomp/Makefile +++ b/src/components/implementation/tests/unit_schedcomp/Makefile @@ -1,8 +1,8 @@ COMPONENT=unit_schedcomp_test.o INTERFACES= -DEPENDENCIES=sched +DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api #required for cos_sinv in llboot.h! +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_rr -lcos_defkernel_api include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c new file mode 100644 index 0000000000..908f12a0cd --- /dev/null +++ b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c @@ -0,0 +1,160 @@ +/* + * Copyright 2016, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. + */ + +#include +#include +#include + +#include +#include +#include + +/* sl also defines a SPIN macro */ +#undef SPIN +#define SPIN(iters) \ + do { \ + if (iters > 0) { \ + for (; iters > 0; iters--) \ + ; \ + } else { \ + while (1) \ + ; \ + } \ + } while (0) + + +#define N_TESTTHDS 8 +#define WORKITERS 10000 + +#define N_TESTTHDS_PERF 2 +#define PERF_ITERS 1000000 + +static volatile cycles_t mid_cycs = 0; +static volatile int testing = 1; + +void +test_thd_perffn(void *data) +{ + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + unsigned int i = 0; + + rdtscll(start_cycs); + sl_thd_yield(0); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff1_cycs = 0, diff2_cycs = 0; + + mid_cycs = 0; + rdtscll(start_cycs); + sl_thd_yield(0); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + diff1_cycs = mid_cycs - start_cycs; + diff2_cycs = end_cycs - mid_cycs; + + if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; + if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; + total_cycs += (diff1_cycs + diff2_cycs); + } + + PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, PERF_ITERS); + testing = 0; + /* done testing! let the spinfn cleanup! 
*/ + sl_thd_yield(0); + + sl_thd_exit(); +} + +void +test_thd_spinfn(void *data) +{ + while (likely(testing)) { + rdtscll(mid_cycs); + sl_thd_yield(0); + } + + sl_thd_exit(); +} + +void +test_thd_fn(void *data) +{ + while (1) { + int workiters = WORKITERS * ((int)data); + + printc("%c", 'a' + (int)data); + //SPIN(workiters); + sl_thd_yield(0); + } +} + +void +test_yield_perf(void) +{ + int i; + struct sl_thd *threads[N_TESTTHDS_PERF]; + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; + + for (i = 0; i < N_TESTTHDS_PERF; i++) { + if (i == 1) threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); + else threads[i] = sl_thd_alloc(test_thd_spinfn, NULL); + assert(threads[i]); + sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + } +} + +void +test_yields(void) +{ + int i; + struct sl_thd * threads[N_TESTTHDS]; + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + + for (i = 0; i < N_TESTTHDS; i++) { + threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); + assert(threads[i]); + sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + } +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + static int first_time = 1, init_done = 0; + + PRINTC("Unit-test for the scheduling library (sl) with capmgr usage\n"); + PRINTC("CPU cycles per sec: %u\n", cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE)); + + if (first_time) { + first_time = 0; + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + init_done = 1; + } else { + while (!init_done) ; + + cos_defcompinfo_sched_init(); + } + + sl_init(SL_MIN_PERIOD_US); + + //test_yield_perf(); + test_yields(); + hypercall_comp_init_done(); + + sl_sched_loop_nonblock(); + + assert(0); + + return; +} diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 09620b0054..3d417794f5 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -413,6 +413,7 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { struct cos_scb_info *scb = sl_scb_info_cpu(); + assert(sl_thd_dcbinfo(curr) && sl_thd_dcbinfo(next)); /* * jump labels in the asm routine: * diff --git a/src/components/lib/cos_dcb.c b/src/components/lib/cos_dcb.c index 3924c21b38..576d1dc2b2 100644 --- a/src/components/lib/cos_dcb.c +++ b/src/components/lib/cos_dcb.c @@ -10,7 +10,7 @@ cos_dcb_info_init_ext(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci, dcb memset(cdi, 0, sizeof(struct cos_dcbinfo_data)); cdi->dcbcaps[0] = initdcbcap; - cdi->dcbaddr[0] = (vaddr_t)cos_init_dcb_get(); + cdi->dcbaddr[0] = initdcbaddr; cdi->curr_cap_off = start_off; cdi->curr_cap = 0; } diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index f76f21e455..e3c6c878c6 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -103,6 +103,7 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) aep->thd = capmgr_thd_create(fn, data, &tid, &dcb); if (!aep->thd) goto done; aep->tid = tid; + assert(tid && dcb); t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); @@ -226,6 +227,7 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c if (prps & SL_THD_PROPERTY_OWN_TCAP) owntc = 1; 
capmgr_aep_create(aep, fn, data, owntc, key, &dcb); if (aep->thd == 0) goto done; + assert(aep->tid && dcb); t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); diff --git a/src/platform/i386/runscripts/unit_hierschedcomps.sh b/src/platform/i386/runscripts/unit_hierschedcomps.sh index ba032033bf..5122af6f50 100644 --- a/src/platform/i386/runscripts/unit_hierschedcomps.sh +++ b/src/platform/i386/runscripts/unit_hierschedcomps.sh @@ -5,8 +5,8 @@ cp root_fprr.o boot.o cp hier_fprr.o hier_fprr1.o cp hier_fprr.o hier_fprr2.o cp hier_fprr.o hier_fprr3.o -cp unit_schedcomp_test.o unit_schedcomp_test1.o -cp unit_schedcomp_test.o unit_schedcomp_test2.o -cp unit_schedcomp_test.o unit_schedcomp_test3.o -cp unit_schedcomp_test.o unit_schedcomp_test4.o -./cos_linker "llboot.o, ;*hier_fprr1.o, ;capmgr.o, ;*hier_fprr2.o, ;*boot.o, ;*hier_fprr3.o, ;unit_schedcomp_test1.o, ;unit_schedcomp_test2.o, ;unit_schedcomp_test3.o, ;unit_schedcomp_test4.o, :boot.o-capmgr.o;hier_fprr1.o-capmgr.o|[parent_]boot.o;hier_fprr2.o-capmgr.o|[parent_]boot.o;hier_fprr3.o-capmgr.o|[parent_]hier_fprr1.o;unit_schedcomp_test1.o-boot.o;unit_schedcomp_test2.o-hier_fprr1.o;unit_schedcomp_test3.o-hier_fprr2.o;unit_schedcomp_test4.o-hier_fprr3.o" ./gen_client_stub +cp unit_schedappcomp_test.o unit_schedappcomp_test1.o +cp unit_schedappcomp_test.o unit_schedappcomp_test2.o +cp unit_schedappcomp_test.o unit_schedappcomp_test3.o +cp unit_schedappcomp_test.o unit_schedappcomp_test4.o +./cos_linker "llboot.o, ;*hier_fprr1.o, ;capmgr.o, ;*hier_fprr2.o, ;*boot.o, ;*hier_fprr3.o, ;unit_schedappcomp_test1.o, ;unit_schedappcomp_test2.o, ;unit_schedappcomp_test3.o, ;unit_schedappcomp_test4.o, :boot.o-capmgr.o;hier_fprr1.o-capmgr.o|[parent_]boot.o;hier_fprr2.o-capmgr.o|[parent_]boot.o;hier_fprr3.o-capmgr.o|[parent_]hier_fprr1.o;unit_schedappcomp_test1.o-boot.o;unit_schedappcomp_test2.o-hier_fprr1.o;unit_schedappcomp_test3.o-hier_fprr2.o;unit_schedappcomp_test4.o-hier_fprr3.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_schedappcomps.sh b/src/platform/i386/runscripts/unit_schedappcomps.sh new file mode 100644 index 0000000000..5792230896 --- /dev/null +++ b/src/platform/i386/runscripts/unit_schedappcomps.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp root_fprr.o boot.o +./cos_linker "llboot.o, ;unit_schedappcomp_test.o, ;capmgr.o, ;unit_schedappaep_test.o, ;*boot.o, :boot.o-capmgr.o;unit_schedappcomp_test.o-boot.o;unit_schedappaep_test.o-boot.o|capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_schedcomp.sh b/src/platform/i386/runscripts/unit_schedcomp.sh index 9327f2ae50..7665041768 100644 --- a/src/platform/i386/runscripts/unit_schedcomp.sh +++ b/src/platform/i386/runscripts/unit_schedcomp.sh @@ -1,5 +1,7 @@ #!/bin/sh cp llboot_comp.o llboot.o -cp root_fprr.o boot.o -./cos_linker "llboot.o, ;unit_schedcomp_test.o, ;capmgr.o, ;unit_schedaep_test.o, ;*boot.o, :boot.o-capmgr.o;unit_schedcomp_test.o-boot.o;unit_schedaep_test.o-boot.o|capmgr.o" ./gen_client_stub +cp unit_schedcomp_test.o boot.o +cp test_boot.o dummy1.o +cp test_boot.o dummy2.o +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From cc233485e132bf18642b44d8e74022f1b6ba0530 Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 1 Apr 2019 12:27:22 -0400 Subject: [PATCH 038/127] Current state, with optimized sl_thd_yield * TODO: cleanup sl_thd_yield functions that are there for debugging. 
* TODO: uncomment sl_schedule side of functionality --- .../tests/unit_schedcomp/unit_schedlib.c | 119 +++++++--- .../include/cos_asm_simple_stacks.h | 3 +- src/components/include/cos_component.h | 32 ++- src/components/include/cos_rdtsc.h | 65 ++++++ src/components/include/sl.h | 203 ++++++++++++------ .../interface/capmgr/stubs/c_stub.c | 32 ++- src/components/lib/sl/sl_capmgr.c | 6 +- src/components/lib/sl/sl_mod_rr.c | 2 +- src/components/lib/sl/sl_raw.c | 6 +- src/components/lib/sl/sl_sched.c | 111 ++++++---- src/kernel/capinv.c | 1 + src/kernel/include/shared/consts.h | 3 +- src/kernel/include/shared/cos_types.h | 2 + 13 files changed, 426 insertions(+), 159 deletions(-) create mode 100644 src/components/include/cos_rdtsc.h diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c index 908f12a0cd..0c93f16cdf 100644 --- a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c @@ -11,6 +11,7 @@ #include #include #include +#include /* sl also defines a SPIN macro */ #undef SPIN @@ -29,57 +30,96 @@ #define N_TESTTHDS 8 #define WORKITERS 10000 -#define N_TESTTHDS_PERF 2 #define PERF_ITERS 1000000 +static cycles_t rdtscp_min = 0, rdtscp_max = 0, rdtscp_avg = 0; +static volatile int switched = 0; static volatile cycles_t mid_cycs = 0; static volatile int testing = 1; +static struct sl_thd *perf_thd, *spin_thd; void test_thd_perffn(void *data) { - cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + thdid_t yield_to = sl_thd_thdid(spin_thd); + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0, bc_cycs = 500; unsigned int i = 0; + int ret = 0; + assert(perf_thd == sl_thd_curr()); rdtscll(start_cycs); - sl_thd_yield(0); + printc("a"); + sl_thd_yield(yield_to); + //ret = sl_thd_dispatch(spin_thd, cos_sched_sync(), perf_thd); + //sl_thd_yield_thd_c(perf_thd, spin_thd); + //sl_thd_yield_thd(spin_thd); + //assert(ret == 0); rdtscll(end_cycs); - assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + //assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + assert(switched); + sl_scb_info_cpu()->reserved_debugging = 1; for (i = 0; i < PERF_ITERS; i++) { cycles_t diff1_cycs = 0, diff2_cycs = 0; - mid_cycs = 0; - rdtscll(start_cycs); - sl_thd_yield(0); - rdtscll(end_cycs); - assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); - - diff1_cycs = mid_cycs - start_cycs; - diff2_cycs = end_cycs - mid_cycs; - - if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; + end_cycs = start_cycs = 0; + //mid_cycs = 0; + switched = 0; + cos_rdtscp(start_cycs); + //rdtscll(start_cycs); + //ret = sl_thd_dispatch(spin_thd, cos_sched_sync(), perf_thd); + printc("a"); + sl_thd_yield(yield_to); + //sl_thd_yield_thd_c(perf_thd, spin_thd); + //sl_thd_yield_thd(spin_thd); + //rdtscll(end_cycs); + cos_rdtscp(end_cycs); + assert(switched); + assert(ret == 0); + //assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + //diff1_cycs = mid_cycs - start_cycs; + diff2_cycs = end_cycs - start_cycs; + assert(diff2_cycs > rdtscp_min); + diff2_cycs -= rdtscp_min; + + //if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; - total_cycs += (diff1_cycs + diff2_cycs); + if (diff2_cycs < bc_cycs) bc_cycs = diff2_cycs; + total_cycs += diff2_cycs; } - PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, 
PERF_ITERS); + assert(sl_scb_info_cpu()->reserved_debugging == 1); + PRINTC("SWITCH UBENCH (2 switches): avg: %llu, wc: %llu, bc: %llu, iters:%u\n", (total_cycs / (PERF_ITERS)), wc_cycs, bc_cycs, PERF_ITERS); testing = 0; - /* done testing! let the spinfn cleanup! */ - sl_thd_yield(0); + /* done testing! free the spin thread! */ + while (1) ; +// sl_thd_free(spin_thd); - sl_thd_exit(); +// sl_thd_exit(); } void test_thd_spinfn(void *data) { + thdid_t yield_to = sl_thd_thdid(perf_thd); + assert(sl_thd_curr() == spin_thd); + while (likely(testing)) { - rdtscll(mid_cycs); - sl_thd_yield(0); + //rdtscll(mid_cycs); + switched = 1; + //sl_thd_dispatch(perf_thd, cos_sched_sync(), spin_thd); + printc("b"); + sl_thd_yield(yield_to); + //sl_thd_yield_thd_c(spin_thd, perf_thd); + //sl_thd_yield_thd(perf_thd); } - sl_thd_exit(); + //sl_thd_dispatch(perf_thd, cos_sched_sync(), spin_thd); + sl_thd_yield(yield_to); + //sl_thd_yield_thd_c(spin_thd, perf_thd); + //sl_thd_yield_thd(perf_thd); + //assert(0); } void @@ -97,17 +137,25 @@ test_thd_fn(void *data) void test_yield_perf(void) { - int i; - struct sl_thd *threads[N_TESTTHDS_PERF]; union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; - for (i = 0; i < N_TESTTHDS_PERF; i++) { - if (i == 1) threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); - else threads[i] = sl_thd_alloc(test_thd_spinfn, NULL); - assert(threads[i]); - sl_thd_param_set(threads[i], sp.v); - PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); - } + assert(NUM_CPU == 1); + + spin_thd = sl_thd_alloc(test_thd_spinfn, NULL); + assert(spin_thd); + sl_thd_param_set(spin_thd, sp.v); + PRINTC("Spin thread %u:%lu created\n", sl_thd_thdid(spin_thd), sl_thd_thdcap(spin_thd)); + + perf_thd = sl_thd_alloc(test_thd_perffn, NULL); + assert(perf_thd); + sl_thd_param_set(perf_thd, sp.v); + PRINTC("Perf thread %u:%lu created\n", sl_thd_thdid(perf_thd), sl_thd_thdcap(perf_thd)); + + sl_thd_yield(sl_thd_thdid(perf_thd)); + //sl_thd_dispatch(perf_thd, cos_sched_sync(), sl_thd_curr()); + //sl_thd_yield_thd_c(sl_thd_curr(), perf_thd); + //sl_thd_yield_thd(perf_thd); + while (1); } void @@ -139,6 +187,9 @@ cos_init(void) first_time = 0; cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init(); + cos_rdtscp_calib(&rdtscp_min, &rdtscp_avg, &rdtscp_max); + PRINTC("RDTSCP MIN:%llu MAX:%llu AVG:%llu\n", rdtscp_min, rdtscp_max, rdtscp_avg); + init_done = 1; } else { while (!init_done) ; @@ -147,11 +198,11 @@ cos_init(void) } sl_init(SL_MIN_PERIOD_US); - - //test_yield_perf(); - test_yields(); hypercall_comp_init_done(); + test_yield_perf(); + //test_yields(); + sl_sched_loop_nonblock(); assert(0); diff --git a/src/components/include/cos_asm_simple_stacks.h b/src/components/include/cos_asm_simple_stacks.h index 2cbe954045..ba27d284a9 100644 --- a/src/components/include/cos_asm_simple_stacks.h +++ b/src/components/include/cos_asm_simple_stacks.h @@ -18,7 +18,8 @@ shr $MAX_STACK_SZ_BYTE_ORDER, %eax; \ shr $16, %edx; \ pushl %edx; \ - pushl %eax; + pushl %eax; \ + pushl $0; #define COS_ASM_GET_STACK \ COS_ASM_GET_STACK_BASIC \ diff --git a/src/components/include/cos_component.h b/src/components/include/cos_component.h index 5929c0b5aa..f8b6870352 100644 --- a/src/components/include/cos_component.h +++ b/src/components/include/cos_component.h @@ -81,7 +81,7 @@ static inline int call_cap_2retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int arg4, unsigned long *r1, unsigned 
long *r2) { - long fault = 0; + long fault = 0, ret2, ret3; int ret; cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; @@ -101,10 +101,13 @@ call_cap_2retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int "movl $1, %%ecx\n\t" \ "3:\n\t" \ "popl %%ebp\n\t" \ - : "=a"(ret), "=c"(fault), "=S"(*r1), "=D"(*r2) + : "=a"(ret), "=c"(fault), "=S"(ret2), "=D"(ret3) : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3), "d"(arg4) : "memory", "cc"); + *r1 = ret2; + *r2 = ret3; + return ret; } @@ -138,9 +141,8 @@ extern struct cos_component_information cos_comp_info; static inline long get_stk_data(int offset) { - unsigned long curr_stk_pointer; + unsigned long curr_stk_pointer = 0; - __asm__("movl %%esp, %0;" : "=r"(curr_stk_pointer)); /* * We save the CPU_ID and thread id in the stack for fast * access. We want to find the struct cos_stk (see the stkmgr @@ -148,7 +150,15 @@ get_stk_data(int offset) * cpu_id. This struct is at the _top_ of the current stack, * and cpu_id is at the top of the struct (it is a u32_t). */ - return *(long *)((curr_stk_pointer & ~(COS_STACK_SZ - 1)) + COS_STACK_SZ - offset * sizeof(u32_t)); + return *(long *)((((unsigned long)(&curr_stk_pointer)) & ~(COS_STACK_SZ - 1)) + COS_STACK_SZ - offset * sizeof(u32_t)); +} + +static inline void +set_stk_data(int offset, long val) +{ + unsigned long curr_stk_pointer = 0; + + *(long *)((((unsigned long)&curr_stk_pointer) & ~(COS_STACK_SZ - 1)) + COS_STACK_SZ - offset * sizeof(u32_t)) = val; } #define GET_CURR_CPU cos_cpuid() @@ -188,6 +198,18 @@ cos_thdid(void) return cos_get_thd_id(); } +static void * +cos_get_slthd_ptr(void) +{ + return (void *)get_stk_data(SLTHDPTR_OFFSET); +} + +static void +cos_set_slthd_ptr(void *ptr) +{ + set_stk_data(SLTHDPTR_OFFSET, (long)ptr); +} + #define ERR_THROW(errval, label) \ do { \ ret = errval; \ diff --git a/src/components/include/cos_rdtsc.h b/src/components/include/cos_rdtsc.h new file mode 100644 index 0000000000..d8ebfad445 --- /dev/null +++ b/src/components/include/cos_rdtsc.h @@ -0,0 +1,65 @@ +#ifndef COS_RDTSC_H +#define COS_RDTSC_H + +#include + +#define COS_RDTSCP_CALIB_ITERS 1000000 + +#define cos_rdtsc rdtscll + +/* Copied from seL4bench */ +#define cos_rdtscp(var) do { \ + u32_t low, high; \ + asm volatile( \ + "movl $0, %%eax \n" \ + "movl $0, %%ecx \n" \ + "cpuid \n" \ + "rdtsc \n" \ + "movl %%edx, %0 \n" \ + "movl %%eax, %1 \n" \ + "movl $0, %%eax \n" \ + "movl $0, %%ecx \n" \ + "cpuid \n" \ + : \ + "=r"(high), \ + "=r"(low) \ + : \ + : "eax", "ebx", "ecx", "edx" \ + ); \ + (var) = (((u64_t)high) << 32ull) | ((u64_t)low); \ +} while(0) + +/* + * use this to calibrate the rdtscp and perhaps use + * min value to remove from your benchmarks + */ +static inline void +cos_rdtscp_calib(cycles_t *min, cycles_t *avg, cycles_t *max) +{ + int i; + volatile cycles_t st, en, mn = 0, mx = 0, total = 0; + + cos_rdtscp(st); + cos_rdtscp(en); + mn = mx = en - st; + + for (i = 0; i < COS_RDTSCP_CALIB_ITERS; i++) { + cycles_t diff; + + cos_rdtscp(st); + cos_rdtscp(en); + + diff = en - st; + total += diff; + if (diff < mn) mn = diff; + if (diff > mx) mx = diff; + } + + if (min) *min = mn; + if (max) *max = mx; + if (avg) *avg = total / COS_RDTSCP_CALIB_ITERS; + + return; +} + +#endif /* COS_RDTSC_H */ diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 3d417794f5..3939d164cd 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -40,6 +40,9 @@ #include #include +#undef SL_TIMEOUTS +#define SL_CS + /* Critical section (cs) API to protect scheduler 
data-structures */ struct sl_cs { union sl_cs_intern { @@ -90,15 +93,18 @@ sl_thd_setprio(struct sl_thd *t, tcap_prio_t p) } /* for lazy retrieval of a child component thread in the parent */ -extern struct sl_thd *sl_thd_retrieve(thdid_t tid); +extern struct sl_thd *sl_thd_retrieve_lazy(thdid_t tid); static inline struct sl_thd * sl_thd_lkup(thdid_t tid) { - assert(tid != 0); - if (unlikely(tid > MAX_NUM_THREADS)) return NULL; + struct sl_thd *t; + + if (unlikely(tid < 1 || tid > MAX_NUM_THREADS)) return NULL; + t = sl_mod_thd_get(sl_thd_lookup_backend(tid)); + if (likely(t && sl_thd_aepinfo(t))) return t; - return sl_thd_retrieve(tid); + return sl_thd_retrieve_lazy(tid); } /* only see if it's already sl_thd initialized */ @@ -107,8 +113,7 @@ sl_thd_try_lkup(thdid_t tid) { struct sl_thd *t = NULL; - assert(tid != 0); - if (unlikely(tid > MAX_NUM_THREADS)) return NULL; + if (unlikely(tid < 1 || tid > MAX_NUM_THREADS)) return NULL; t = sl_mod_thd_get(sl_thd_lookup_backend(tid)); if (!sl_thd_aepinfo(t)) return NULL; @@ -119,19 +124,21 @@ sl_thd_try_lkup(thdid_t tid) static inline thdid_t sl_thdid(void) { - thdid_t tid = cos_thdid(); - - assert(tid != 0); - assert(tid < MAX_NUM_THREADS); - - return tid; + return cos_thdid(); } static inline struct sl_thd * sl_thd_curr(void) { - return sl_thd_lkup(sl_thdid()); + struct sl_thd *t = (struct sl_thd *)cos_get_slthd_ptr(); + + if (likely(t)) return t; + + t = sl_thd_lkup(sl_thdid()); + cos_set_slthd_ptr((void *)t); + + return t; } /* are we the owner of the critical section? */ @@ -154,7 +161,7 @@ sl_cs_owner(void) * -ve from cos_defswitch failure, allowing caller for ex: the scheduler thread to * check if it was -EBUSY to first recieve pending notifications before retrying lock. */ -int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, thdcap_t curr, sched_tok_t tok); +int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_cpu *gcpu, struct sl_thd *curr, sched_tok_t tok); /* * @csi: current critical section value * @cached: a cached copy of @csi @@ -162,28 +169,29 @@ int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, * * @ret: returns 1 if we need a retry, 0 otherwise */ -int sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, sched_tok_t tok); +int sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_cpu *gcpu, sched_tok_t tok); /* Enter into the scheduler critical section */ static inline int sl_cs_enter_nospin(void) { +#ifdef SL_CS + struct sl_global_cpu *gcpu = sl__globals_cpu(); + struct sl_thd *t = sl_thd_curr(); union sl_cs_intern csi, cached; - struct sl_thd * t = sl_thd_curr(); - sched_tok_t tok; assert(t); - tok = cos_sched_sync(); - csi.v = sl__globals_cpu()->lock.u.v; + csi.v = gcpu->lock.u.v; cached.v = csi.v; if (unlikely(csi.s.owner)) { - return sl_cs_enter_contention(&csi, &cached, sl_thd_thdcap(t), tok); + assert(0); + return sl_cs_enter_contention(&csi, &cached, gcpu, t, cos_sched_sync()); } csi.s.owner = sl_thd_thdcap(t); - if (!ps_cas(&sl__globals_cpu()->lock.u.v, cached.v, csi.v)) return 1; - + if (!ps_upcas(&gcpu->lock.u.v, cached.v, csi.v)) return 1; +#endif return 0; } @@ -218,22 +226,24 @@ sl_cs_enter_sched(void) static inline void sl_cs_exit(void) { +#ifdef SL_CS + struct sl_global_cpu *gcpu = sl__globals_cpu(); union sl_cs_intern csi, cached; - sched_tok_t tok; assert(sl_cs_owner()); - retry: - tok = cos_sched_sync(); - csi.v = sl__globals_cpu()->lock.u.v; + csi.v 
= gcpu->lock.u.v; cached.v = csi.v; if (unlikely(csi.s.contention)) { - if (sl_cs_exit_contention(&csi, &cached, tok)) goto retry; + assert(0); + if (sl_cs_exit_contention(&csi, &cached, gcpu, cos_sched_sync())) goto retry; + return; } - if (!ps_cas(&sl__globals_cpu()->lock.u.v, cached.v, 0)) goto retry; + if (!ps_upcas(&gcpu->lock.u.v, cached.v, 0)) goto retry; +#endif } /* @@ -277,7 +287,8 @@ int sl_thd_sched_wakeup_no_cs(struct sl_thd *t); /* wakeup thread and do not remove from timeout queue if blocked on timeout */ int sl_thd_wakeup_no_cs_rm(struct sl_thd *t); -void sl_thd_yield(thdid_t tid); +void sl_thd_yield_intern(thdid_t tid); + void sl_thd_yield_cs_exit(thdid_t tid); /* The entire thread allocation and free API */ @@ -345,6 +356,7 @@ sl_timeout_period_get(void) return sl__globals_cpu()->period; } +#ifdef SL_TIMEOUTS static inline void sl_timeout_oneshot(cycles_t absolute_us) { @@ -399,6 +411,7 @@ sl_timeout_wakeup_expired(cycles_t now) sl_thd_wakeup_no_cs_rm(th); } while (heap_size(sl_timeout_heap())); } +#endif static inline int sl_thd_is_runnable(struct sl_thd *t) @@ -431,6 +444,7 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) */ __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ "movl $2f, (%%eax)\n\t" \ "movl %%esp, 4(%%eax)\n\t" \ "cmp $0, 4(%%ebx)\n\t" \ @@ -439,46 +453,65 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "movl 4(%%ebx), %%esp\n\t" \ "jmp *(%%ebx)\n\t" \ "1:\n\t" \ - "pushl %%ebp\n\t" \ "movl %%esp, %%ebp\n\t" \ "pushl %%edx\n\t" \ "call sl_thd_kern_dispatch\n\t" \ "addl $4, %%esp\n\t" \ - "popl %%ebp\n\t" \ "jmp 3f\n\t" \ ".align 4\n\t" \ "2:\n\t" \ "movl $0, 4(%%ebx)\n\t" \ ".align 4\n\t" \ "3:\n\t" \ + "popl %%ebp\n\t" \ : : "a" (sl_thd_dcbinfo(curr)), "b" (sl_thd_dcbinfo(next)), "S" ((u32_t)((u64_t)tok >> 32)), "D" ((u32_t)(((u64_t)tok << 32) >> 32)), "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); - return sl_scb_info_cpu()->sched_tok != tok ? 
-EAGAIN : 0; + if (likely(sl_scb_info_cpu()->sched_tok == tok)) return 0; + + return -EAGAIN; } static inline int sl_thd_activate(struct sl_thd *t, sched_tok_t tok) { - struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = &dci->ci; - struct sl_global_cpu *g = sl__globals_cpu(); - int ret = 0; +// struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); +// struct cos_compinfo *ci = &dci->ci; +// struct sl_global_cpu *g = sl__globals_cpu(); +// int ret = 0; +#if 0 if (t->properties & SL_THD_PROPERTY_SEND) { return cos_sched_asnd(t->sndcap, g->timeout_next, g->sched_rcv, tok); } else if (t->properties & SL_THD_PROPERTY_OWN_TCAP) { return cos_switch(sl_thd_thdcap(t), sl_thd_tcap(t), t->prio, g->timeout_next, g->sched_rcv, tok); } else { +#endif /* TODO: can't use if you're reprogramming a timer/prio */ return sl_thd_dispatch(t, tok, sl_thd_curr()); //return cos_switch(sl_thd_thdcap(t), g->sched_tcap, t->prio, // g->timeout_next, g->sched_rcv, tok); +#if 0 } +#endif +} + +static inline int +sl_cs_exit_schedule_nospin_arg_c(struct sl_thd *curr, struct sl_thd *next) +{ + sched_tok_t tok; +#ifdef SL_CS + if (likely(!sl_cs_owner())) sl_cs_enter(); +#endif + tok = cos_sched_sync(); +#ifdef SL_CS + sl_cs_exit(); +#endif + return sl_thd_dispatch(next, tok, curr); } /* @@ -508,24 +541,28 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) static inline int sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) { - struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = &dci->ci; - struct sl_thd_policy *pt; - struct sl_thd * t; - struct sl_global_cpu *globals = sl__globals_cpu(); +// return sl_thd_dispatch(to, cos_sched_sync(), sl_thd_curr()); +#if 1 + struct sl_thd *t = to; +// struct sl_global_cpu *globals = sl__globals_cpu(); sched_tok_t tok; - cycles_t now; - s64_t offset; - int ret; +// cycles_t now; +// s64_t offset; +// int ret; /* Don't abuse this, it is only to enable the tight loop around this function for races... */ - if (unlikely(!sl_cs_owner())) sl_cs_enter(); +#ifdef SL_CS + if (likely(!sl_cs_owner())) sl_cs_enter(); +#endif tok = cos_sched_sync(); - now = sl_now(); +// now = sl_now(); + +#ifdef SL_TIMEOUTS offset = (s64_t)(globals->timer_next - now); if (globals->timer_next && offset <= 0) sl_timeout_expended(now, globals->timer_next); sl_timeout_wakeup_expired(now); +#endif /* * Once we exit, we can't trust t's memory as it could be @@ -534,19 +571,23 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) * catch it. This is a little twitchy and subtle, so lets put * it in a function, here. 
*/ - if (unlikely(to)) { - t = to; - if (!sl_thd_is_runnable(t)) to = NULL; - } - if (likely(!to)) { - pt = sl_mod_schedule(); - if (unlikely(!pt)) - t = sl__globals_cpu()->idle_thd; - else - t = sl_mod_thd_get(pt); - } - +// if (likely(to)) { +// t = to; +// if (unlikely(!sl_thd_is_runnable(t))) to = NULL; +// } +// if (unlikely(!to)) { +// struct sl_thd_policy *pt = sl_mod_schedule(); +// +// if (unlikely(!pt)) +// t = sl__globals_cpu()->idle_thd; +// else +// t = sl_mod_thd_get(pt); +// } + +#if 0 if (t->properties & SL_THD_PROPERTY_OWN_TCAP && t->budget) { + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + assert(t->period); assert(sl_thd_tcap(t) != sl__globals_cpu()->sched_tcap); @@ -566,12 +607,16 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) if (likely(ret == 0)) t->last_replenish = replenish; } } +#endif - assert(sl_thd_is_runnable(t)); +// assert(t && sl_thd_is_runnable(t)); +#ifdef SL_CS sl_cs_exit(); - if (t == sl_thd_curr()) return 0; +#endif - ret = sl_thd_activate(t, tok); + return sl_thd_dispatch(t, tok, sl_thd_curr()); +// ret = sl_thd_activate(t, tok); +#if 0 /* * dispatch failed with -EPERM because tcap associated with thread t does not have budget. * Block the thread until it's next replenishment and return to the scheduler thread. @@ -583,8 +628,10 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) sl_thd_block_expiry(t); if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok); } +#endif - return ret; +// return ret; +#endif } static inline int @@ -614,6 +661,14 @@ sl_cs_exit_switchto(struct sl_thd *to) } } +static inline void +sl_cs_exit_switchto_c(struct sl_thd *c, struct sl_thd *n) +{ + if (sl_cs_exit_schedule_nospin_arg_c(c, n)) { + sl_cs_exit_schedule(); + } +} + /* * Initialization protocol in cos_init: initialization of * library-internal data-structures, and then the ability for the @@ -643,5 +698,29 @@ void sl_sched_loop(void) __attribute__((noreturn)); * booter receive (INITRCV) end-point at the kernel level. 
*/ void sl_sched_loop_nonblock(void) __attribute__((noreturn)); +static inline void +sl_thd_yield_thd_c(struct sl_thd *c, struct sl_thd *n) +{ + if (likely(c && n)) sl_cs_exit_switchto_c(c, n); + else sl_thd_yield_intern(0); +} + +static inline void +sl_thd_yield_thd(struct sl_thd *n) +{ + if (likely(n)) sl_cs_exit_switchto(n); + else sl_thd_yield_intern(0); +} + +static inline void +sl_thd_yield(thdid_t tid) +{ + if (likely(tid)) { + sl_cs_enter(); + sl_cs_exit_switchto(sl_thd_lkup(tid)); + } else { + sl_thd_yield_intern(0); + } +} #endif /* SL_H */ diff --git a/src/components/interface/capmgr/stubs/c_stub.c b/src/components/interface/capmgr/stubs/c_stub.c index d1dbfc9606..fa0490dfcd 100644 --- a/src/components/interface/capmgr/stubs/c_stub.c +++ b/src/components/interface/capmgr/stubs/c_stub.c @@ -17,35 +17,51 @@ thdcap_t capmgr_thd_retrieve_cserialized(thdid_t *inittid, int *unused, spdid_t thdcap_t capmgr_thd_retrieve(spdid_t child, thdid_t tid, thdid_t *inittid) { - int unused; + int r1, r2, r3; - return capmgr_thd_retrieve_cserialized(inittid, &unused, child, tid); + r1 = capmgr_thd_retrieve_cserialized((thdid_t *)&r2, &r3, child, tid); + *inittid = r2; + + return r1; } thdcap_t capmgr_thd_retrieve_next(spdid_t child, thdid_t *tid) { - int unused; + int r1, r2, r3; + + r1 = capmgr_thd_retrieve_next_cserialized((thdid_t *)&r2, &r3, child); + *tid = r2; - return capmgr_thd_retrieve_next_cserialized(tid, &unused, child); + return r1; } thdcap_t capmgr_initthd_create(spdid_t child, thdid_t *tid) { - int unused; + int r1, r2, r3; + + r1 = capmgr_initthd_create_cserialized((thdid_t *)&r2, &r3, child); + *tid = r2; - return capmgr_initthd_create_cserialized(tid, &unused, child); + return r1; } thdcap_t capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid, struct cos_dcb_info **dcb) { + int r1, r2, r3; thdclosure_index_t idx = cos_thd_init_alloc(fn, data); - if (idx < 1) return 0; + if (unlikely(idx < 1)) return 0; + + r1 = capmgr_thd_create_cserialized((struct cos_dcb_info **)&r2, (thdid_t *)&r3, idx); + *dcb = (struct cos_dcb_info *)r2; + *tid = r3; + + return r1; - return capmgr_thd_create_cserialized(dcb, tid, idx); + //return capmgr_thd_create_cserialized(dcb, tid, idx); } thdcap_t diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index e3c6c878c6..7cb20e5ad3 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -102,6 +102,7 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) aep->thd = capmgr_thd_create(fn, data, &tid, &dcb); if (!aep->thd) goto done; + PRINTC("%s:%d %u %p\n", __func__, __LINE__, aep->tid, dcb); aep->tid = tid; assert(tid && dcb); @@ -366,15 +367,14 @@ sl_thd_init_ext(struct cos_aep_info *aepthd, struct sl_thd *sched) } struct sl_thd * -sl_thd_retrieve(thdid_t tid) +sl_thd_retrieve_lazy(thdid_t tid) { - struct sl_thd *t = sl_mod_thd_get(sl_thd_lookup_backend(tid)); + struct sl_thd *t; spdid_t client = cos_inv_token(); thdid_t itid = 0; struct sl_thd *it = NULL; struct cos_aep_info aep; - if (t && sl_thd_aepinfo(t)) return t; if (tid >= SL_MAX_NUM_THDS) return NULL; assert(client); diff --git a/src/components/lib/sl/sl_mod_rr.c b/src/components/lib/sl/sl_mod_rr.c index bd796a1346..3db300a735 100644 --- a/src/components/lib/sl/sl_mod_rr.c +++ b/src/components/lib/sl/sl_mod_rr.c @@ -17,7 +17,7 @@ sl_mod_schedule(void) { struct sl_thd_policy *t = NULL; - if (ps_list_head_empty(&threads[cos_cpuid()])) goto done; + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; t = 
ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); ps_list_rem_d(t); ps_list_head_append_d(&threads[cos_cpuid()], t); diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index dcb7919449..111d6956a6 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -348,9 +348,11 @@ sl_thd_init_ext(struct cos_aep_info *aepthd, struct sl_thd *sched) } struct sl_thd * -sl_thd_retrieve(thdid_t tid) +sl_thd_retrieve_lazy(thdid_t tid) { - return sl_mod_thd_get(sl_thd_lookup_backend(tid)); + /* without capmgr, there is no lazy retrieval of threads! */ + assert(0); + return NULL; } void diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 769702bbce..1adb385901 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -27,40 +27,40 @@ extern void sl_xcpu_asnd_alloc(void); * critical section (cs) code to save on code size/locality */ int -sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, thdcap_t curr, sched_tok_t tok) +sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_cpu *gcpu, struct sl_thd *curr, sched_tok_t tok) { - struct sl_thd *t = sl_thd_curr(); - struct sl_global_cpu *g = sl__globals_cpu(); +#ifdef SL_CS int ret; /* recursive locks are not allowed */ - assert(csi->s.owner != sl_thd_thdcap(t)); + assert(csi->s.owner != sl_thd_thdcap(curr)); if (!csi->s.contention) { csi->s.contention = 1; - if (!ps_cas(&g->lock.u.v, cached->v, csi->v)) return 1; + if (!ps_upcas(&gcpu->lock.u.v, cached->v, csi->v)) return 1; } /* Switch to the owner of the critical section, with inheritance using our tcap/priority */ - if ((ret = cos_defswitch(csi->s.owner, t->prio, csi->s.owner == sl_thd_thdcap(g->sched_thd) ? - TCAP_TIME_NIL : g->timeout_next, tok))) return ret; + if ((ret = cos_defswitch(csi->s.owner, curr->prio, csi->s.owner == sl_thd_thdcap(gcpu->sched_thd) ? + TCAP_TIME_NIL : gcpu->timeout_next, tok))) return ret; /* if we have an outdated token, then we want to use the same repeat loop, so return to that */ +#endif return 1; } /* Return 1 if we need a retry, 0 otherwise */ int -sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, sched_tok_t tok) +sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_cpu *gcpu, sched_tok_t tok) { - struct sl_thd *t = sl_thd_curr(); - struct sl_global_cpu *g = sl__globals_cpu(); - - if (!ps_cas(&g->lock.u.v, cached->v, 0)) return 1; +#ifdef SL_CS + if (!ps_upcas(&gcpu->lock.u.v, cached->v, 0)) return 1; /* let the scheduler thread decide which thread to run next, inheriting our budget/priority */ - cos_defswitch(g->sched_thdcap, t->prio, TCAP_TIME_NIL, tok); + cos_defswitch(gcpu->sched_thdcap, sl_thd_curr()->prio, TCAP_TIME_NIL, tok); +#endif return 0; } +#ifdef SL_TIMEOUTS /* Timeout and wakeup functionality */ /* * TODO: @@ -110,27 +110,6 @@ sl_timeout_remove(struct sl_thd *t) t->timeout_idx = -1; } -void -sl_thd_free_no_cs(struct sl_thd *t) -{ - struct sl_thd *ct = sl_thd_curr(); - - assert(t); - assert(t->state != SL_THD_FREE); - if (t->state == SL_THD_BLOCKED_TIMEOUT) sl_timeout_remove(t); - sl_thd_index_rem_backend(sl_mod_thd_policy_get(t)); - sl_mod_thd_delete(sl_mod_thd_policy_get(t)); - t->state = SL_THD_FREE; - /* TODO: add logic for the graveyard to delay this deallocation if t == current */ - sl_thd_free_backend(sl_mod_thd_policy_get(t)); - - /* thread should not continue to run if it deletes itself. 
*/ - if (unlikely(t == ct)) { - while (1) sl_cs_exit_schedule(); - /* FIXME: should never get here, but tcap mechanism can let a child scheduler run! */ - } -} - static int __sl_timeout_compare_min(void *a, void *b) { @@ -151,7 +130,46 @@ sl_timeout_init(microsec_t period) memset(&timeout_heap[cos_cpuid()], 0, sizeof(struct timeout_heap)); heap_init(sl_timeout_heap(), SL_MAX_NUM_THDS, __sl_timeout_compare_min, __sl_timeout_update_idx); } +#else +static inline void +sl_timeout_remove(struct sl_thd *t) +{ } + +static inline void +sl_timeout_block(struct sl_thd *t, cycles_t timeout) +{ } + +static void +sl_timeout_init(microsec_t period) +{ + assert(period >= SL_MIN_PERIOD_US); + sl_timeout_period(period); +} +#endif + +void +sl_thd_free_no_cs(struct sl_thd *t) +{ + struct sl_thd *ct = sl_thd_curr(); + + assert(t); + assert(t->state != SL_THD_FREE); + if (t->state == SL_THD_BLOCKED_TIMEOUT) sl_timeout_remove(t); + sl_thd_index_rem_backend(sl_mod_thd_policy_get(t)); + sl_mod_thd_delete(sl_mod_thd_policy_get(t)); + t->state = SL_THD_FREE; + /* TODO: add logic for the graveyard to delay this deallocation if t == current */ + sl_thd_free_backend(sl_mod_thd_policy_get(t)); + + /* thread should not continue to run if it deletes itself. */ + if (unlikely(t == ct)) { + while (1) { + sl_cs_exit_schedule(); + } + /* FIXME: should never get here, but tcap mechanism can let a child scheduler run! */ + } +} /* * This API is only used by the scheduling thread to block an AEP thread. * AEP thread scheduling events could be redundant. @@ -431,17 +449,17 @@ sl_thd_wakeup(thdid_t tid) return; } -void -sl_thd_yield_cs_exit(thdid_t tid) +static inline void +sl_thd_yield_cs_exit_intern(thdid_t tid) { struct sl_thd *t = sl_thd_curr(); /* reset rcv_suspended if the scheduler thinks "curr" was suspended on cos_rcv previously */ - sl_thd_sched_unblock_no_cs(t); - if (tid) { + //sl_thd_sched_unblock_no_cs(t); + if (likely(tid)) { struct sl_thd *to = sl_thd_lkup(tid); - assert(to); + //assert(to); sl_cs_exit_switchto(to); } else { sl_mod_yield(sl_mod_thd_policy_get(t), NULL); @@ -449,11 +467,18 @@ sl_thd_yield_cs_exit(thdid_t tid) } } + +void +sl_thd_yield_cs_exit(thdid_t tid) +{ + sl_thd_yield_cs_exit_intern(tid); +} + void -sl_thd_yield(thdid_t tid) +sl_thd_yield_intern(thdid_t tid) { sl_cs_enter(); - sl_thd_yield_cs_exit(tid); + sl_thd_yield_cs_exit_intern(tid); } void @@ -527,7 +552,9 @@ sl_timeout_period(microsec_t period) cycles_t p = sl_usec2cyc(period); sl__globals_cpu()->period = p; +#ifdef SL_TIMEOUTS sl_timeout_relative(p); +#endif } /* engage space heater mode */ @@ -686,7 +713,6 @@ sl_sched_loop_intern(int non_block) /* process notifications from the parent of my threads */ while (sl_child_notif_dequeue(¬if)) { - PRINTC("NOTIF FROM PARENT FOR %d\n", notif.tid); struct sl_thd *t = sl_thd_lkup(notif.tid); if (notif.type == SL_CHILD_THD_BLOCK) sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0); @@ -720,6 +746,7 @@ sl_sched_loop_nonblock(void) int sl_thd_kern_dispatch(thdcap_t t) { + PRINTC("K"); //return cos_switch(t, sl__globals_cpu()->sched_tcap, 0, sl__globals_cpu()->timeout_next, sl__globals_cpu()->sched_rcv, cos_sched_sync()); return cos_thd_switch(t); } diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 8222ff2a50..8ce9ac2bff 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -101,6 +101,7 @@ cap_ulthd_lazyupdate(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, if (unlikely(!(*ci_ptr)->scb_data)) goto done; scb_core = (((*ci_ptr)->scb_data) + get_cpuid()); + 
scb_core->reserved_debugging = 0; if (unlikely(interrupt)) { assert(scb_core->sched_tok < ~0U); diff --git a/src/kernel/include/shared/consts.h b/src/kernel/include/shared/consts.h index e059c507a7..dddbb93a23 100644 --- a/src/kernel/include/shared/consts.h +++ b/src/kernel/include/shared/consts.h @@ -136,6 +136,7 @@ struct pt_regs { * offsets below are used to access CPU and thread IDs. */ #define CPUID_OFFSET 1 #define THDID_OFFSET 2 -#define INVTOKEN_OFFSET 3 +#define SLTHDPTR_OFFSET 3 +#define INVTOKEN_OFFSET 4 #endif diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index 4193830ebb..0ccb8ebb4d 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -429,11 +429,13 @@ struct cos_scb_info { capid_t curr_thd; cycles_t timer_next; sched_tok_t sched_tok; + int reserved_debugging; } CACHE_ALIGNED; struct cos_dcb_info { unsigned long ip; unsigned long sp; + int reserved_debugging; } __attribute__((packed)); /* From 89adbbce558a6ae9ca0f9fbc7d628eb247e934f7 Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 1 Apr 2019 14:01:09 -0400 Subject: [PATCH 039/127] Fix errors with GCC7, -r and -pie cannot be used together --- src/components/Makefile.comp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 9f4f96dc8c..116c5443fd 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -44,8 +44,8 @@ OPT= -g -fvar-tracking OPT= -O3 CFLAGS=-m32 -D__x86__ -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -fno-stack-protector -fno-omit-frame-pointer -Wno-unused-variable $(INC_PATH) $(MUSLINC) $(LWIPINC) $(LUAINC) $(OPT) $(SHARED_FLAGS) CXXFLAGS=-fno-exceptions -fno-threadsafe-statics -Wno-write-strings $(CFLAGS) -LDFLAGS=-melf_i386 -MUSLCFLAGS=$(CFLAGS) -lc -lgcc -Xlinker -r +LDFLAGS=-melf_i386 -no-pie +MUSLCFLAGS=$(CFLAGS) -lc -lgcc -Xlinker -r -no-pie ASFLAGS=-m32 $(INC_PATH) $(SHARED_FLAGS) SERVER_STUB=s_stub.o From 1b8c8c1fd789fa10571717864f8609072d1da4ec Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 1 Apr 2019 14:15:13 -0400 Subject: [PATCH 040/127] Removed debug printc --- src/components/lib/sl/sl_capmgr.c | 1 - src/components/lib/sl/sl_sched.c | 1 - 2 files changed, 2 deletions(-) diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index 7cb20e5ad3..6bf0b432c6 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -102,7 +102,6 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) aep->thd = capmgr_thd_create(fn, data, &tid, &dcb); if (!aep->thd) goto done; - PRINTC("%s:%d %u %p\n", __func__, __LINE__, aep->tid, dcb); aep->tid = tid; assert(tid && dcb); diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 1adb385901..b90ebdd394 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -746,7 +746,6 @@ sl_sched_loop_nonblock(void) int sl_thd_kern_dispatch(thdcap_t t) { - PRINTC("K"); //return cos_switch(t, sl__globals_cpu()->sched_tcap, 0, sl__globals_cpu()->timeout_next, sl__globals_cpu()->sched_rcv, cos_sched_sync()); return cos_thd_switch(t); } From f1dd90b306e31d179dd66577d336ccbc1166b754 Mon Sep 17 00:00:00 2001 From: phani Date: Tue, 2 Apr 2019 14:53:05 -0400 Subject: [PATCH 041/127] Fixed usage of interfaces with return values * The pointers passed to the "SINV" call must be at least as small as the register sizes as the assembly for SINV writes registers back to those 
pointers. --- .../tests/unit_schedcomp/unit_schedlib.c | 6 +++--- src/components/include/cos_component.h | 9 ++++----- src/components/interface/capmgr/stubs/c_stub.c | 10 +++++++--- src/components/interface/sched/stubs/c_stub.c | 7 +++---- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c index 0c93f16cdf..83300dfd64 100644 --- a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c @@ -48,7 +48,7 @@ test_thd_perffn(void *data) assert(perf_thd == sl_thd_curr()); rdtscll(start_cycs); - printc("a"); + //printc("a"); sl_thd_yield(yield_to); //ret = sl_thd_dispatch(spin_thd, cos_sched_sync(), perf_thd); //sl_thd_yield_thd_c(perf_thd, spin_thd); @@ -68,7 +68,7 @@ test_thd_perffn(void *data) cos_rdtscp(start_cycs); //rdtscll(start_cycs); //ret = sl_thd_dispatch(spin_thd, cos_sched_sync(), perf_thd); - printc("a"); + //printc("a"); sl_thd_yield(yield_to); //sl_thd_yield_thd_c(perf_thd, spin_thd); //sl_thd_yield_thd(spin_thd); @@ -109,7 +109,7 @@ test_thd_spinfn(void *data) //rdtscll(mid_cycs); switched = 1; //sl_thd_dispatch(perf_thd, cos_sched_sync(), spin_thd); - printc("b"); + //printc("b"); sl_thd_yield(yield_to); //sl_thd_yield_thd_c(spin_thd, perf_thd); //sl_thd_yield_thd(perf_thd); diff --git a/src/components/include/cos_component.h b/src/components/include/cos_component.h index f8b6870352..8508e5bf99 100644 --- a/src/components/include/cos_component.h +++ b/src/components/include/cos_component.h @@ -46,6 +46,7 @@ call_cap_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int arg4) return ret; } +/* NOTE: make sure the memory locations r1, r2 & r3 are at least word-sized as the register stores are word-sized! */ static inline int call_cap_retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int arg4, unsigned long *r1, unsigned long *r2, unsigned long *r3) @@ -77,11 +78,12 @@ call_cap_retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int a return ret; } +/* NOTE: make sure the memory locations r1 & r2 are at least word-sized as the register stores are word-sized! 
*/ static inline int call_cap_2retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int arg4, unsigned long *r1, unsigned long *r2) { - long fault = 0, ret2, ret3; + long fault = 0; int ret; cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; @@ -101,13 +103,10 @@ call_cap_2retvals_asm(u32_t cap_no, u32_t op, int arg1, int arg2, int arg3, int "movl $1, %%ecx\n\t" \ "3:\n\t" \ "popl %%ebp\n\t" \ - : "=a"(ret), "=c"(fault), "=S"(ret2), "=D"(ret3) + : "=a"(ret), "=c"(fault), "=S"(*r1), "=D"(*r2) : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3), "d"(arg4) : "memory", "cc"); - *r1 = ret2; - *r2 = ret3; - return ret; } diff --git a/src/components/interface/capmgr/stubs/c_stub.c b/src/components/interface/capmgr/stubs/c_stub.c index fa0490dfcd..3fb2cf200f 100644 --- a/src/components/interface/capmgr/stubs/c_stub.c +++ b/src/components/interface/capmgr/stubs/c_stub.c @@ -60,14 +60,18 @@ capmgr_thd_create(cos_thd_fn_t fn, void *data, thdid_t *tid, struct cos_dcb_info *tid = r3; return r1; - - //return capmgr_thd_create_cserialized(dcb, tid, idx); } thdcap_t capmgr_thd_create_ext(spdid_t child, thdclosure_index_t idx, thdid_t *tid, struct cos_dcb_info **dcb) { - return capmgr_thd_create_ext_cserialized(dcb, tid, child, idx); + int r1, r2, r3; + + r1 = capmgr_thd_create_ext_cserialized((struct cos_dcb_info **)&r2, (thdid_t *)&r3, child, idx); + *tid = r3; + *dcb = (struct cos_dcb_info *)r2; + + return r1; } thdcap_t diff --git a/src/components/interface/sched/stubs/c_stub.c b/src/components/interface/sched/stubs/c_stub.c index 9a2066e002..8fb79af9fb 100644 --- a/src/components/interface/sched/stubs/c_stub.c +++ b/src/components/interface/sched/stubs/c_stub.c @@ -33,19 +33,18 @@ thdid_t sched_aep_create(struct cos_aep_info *aep, cos_aepthd_fn_t fn, void *data, int owntc, cos_channelkey_t key) { thdclosure_index_t idx = cos_thd_init_alloc(cos_aepthd_fn, (void *)aep); - arcvcap_t rcv; - int ret; + int ret, ret2; int unused; if (idx < 1) return 0; memset(aep, 0, sizeof(struct cos_aep_info)); - ret = sched_aep_create_cserialized(&rcv, &unused, idx, owntc, key); + ret = sched_aep_create_cserialized((arcvcap_t *)&ret2, &unused, idx, owntc, key); if (!ret) return 0; aep->fn = fn; aep->data = data; - aep->rcv = rcv; + aep->rcv = ret2; aep->tid = ret; return ret; From c5f258962228cec5cc89ffc6ecac344c1d6cae97 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 4 Apr 2019 12:55:23 -0400 Subject: [PATCH 042/127] 32bit compilation in gcc --- src/components/Makefile.comp | 12 ++++++++++-- src/platform/linker/Makefile | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 116c5443fd..78ece3ad0e 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -44,10 +44,18 @@ OPT= -g -fvar-tracking OPT= -O3 CFLAGS=-m32 -D__x86__ -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -fno-stack-protector -fno-omit-frame-pointer -Wno-unused-variable $(INC_PATH) $(MUSLINC) $(LWIPINC) $(LUAINC) $(OPT) $(SHARED_FLAGS) CXXFLAGS=-fno-exceptions -fno-threadsafe-statics -Wno-write-strings $(CFLAGS) -LDFLAGS=-melf_i386 -no-pie -MUSLCFLAGS=$(CFLAGS) -lc -lgcc -Xlinker -r -no-pie +LDFLAGS=-melf_i386 +MUSLCFLAGS=$(CFLAGS) -lc -lgcc -Xlinker -r ASFLAGS=-m32 $(INC_PATH) $(SHARED_FLAGS) +GCC_PIE=$(shell gcc -v 2>&1 | grep -c "\--enable-default-pie") +ifeq ($(GCC_PIE),1) +MUSLCFLAGS+=-no-pie +LDFLAGS+=-no-pie +CFLAGS+=-fno-pie +CXXFLAGS+=-fno-pie +endif + SERVER_STUB=s_stub.o CLIENT_STUB=c_stub.o diff --git 
a/src/platform/linker/Makefile b/src/platform/linker/Makefile index 98394046d6..bafdde4485 100644 --- a/src/platform/linker/Makefile +++ b/src/platform/linker/Makefile @@ -2,7 +2,7 @@ include Makefile.src CC=gcc LD=ld -CFLAGS=-D__x86__ -D_GNU_SOURCE -lpthread -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast -Wno-format -ggdb3 -I$(SHAREDINC) +CFLAGS=-m32 -D__x86__ -D_GNU_SOURCE -lpthread -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast -Wno-format -ggdb3 -I$(SHAREDINC) LDFLAGS=-melf_i386 PRODUCTS=cos_linker gen_client_stub From dd3a1f6d2efc157ee637a8791b3098b18051a475 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 4 Apr 2019 20:01:06 -0400 Subject: [PATCH 043/127] optimizing sched_events and pending : code not working yet --- .../no_interface/vkernel/vm_booter.c | 4 +- .../tests/micro_booter/mb_tests.c | 54 ++++---- .../tests/unit_defcompinfo/unit_defcompinfo.c | 8 +- .../tests/unit_schedappaep/unit_schedappaep.c | 4 +- .../tests/unit_schedcomp/unit_schedlib.c | 2 - src/components/include/cos_component.h | 1 + src/components/include/cos_kernel_api.h | 6 +- src/components/include/sl.h | 95 ++++++++++++++ src/components/include/sl_thd.h | 8 +- src/components/lib/cos_kernel_api.c | 14 +-- src/components/lib/sl/sl_sched.c | 51 ++++---- src/kernel/capinv.c | 14 +-- src/kernel/include/component.h | 1 + src/kernel/include/shared/cos_sched.h | 53 ++++++++ src/kernel/include/shared/cos_types.h | 28 ----- src/kernel/include/thd.h | 117 ++++++++++-------- 16 files changed, 288 insertions(+), 172 deletions(-) create mode 100644 src/kernel/include/shared/cos_sched.h diff --git a/src/components/implementation/no_interface/vkernel/vm_booter.c b/src/components/implementation/no_interface/vkernel/vm_booter.c index a64cd656b4..9ece4ca700 100644 --- a/src/components/implementation/no_interface/vkernel/vm_booter.c +++ b/src/components/implementation/no_interface/vkernel/vm_booter.c @@ -46,7 +46,7 @@ dom0_io_fn(void *id) { arcvcap_t rcvcap = dom0_vio_rcvcap((unsigned int)id); while (1) { - cos_rcv(rcvcap, 0, NULL); + cos_rcv(rcvcap, 0); } } @@ -55,6 +55,6 @@ vm_io_fn(void *id) { arcvcap_t rcvcap = VM_CAPTBL_SELF_IORCV_BASE; while (1) { - cos_rcv(rcvcap, 0, NULL); + cos_rcv(rcvcap, 0); } } diff --git a/src/components/implementation/tests/micro_booter/mb_tests.c b/src/components/implementation/tests/micro_booter/mb_tests.c index 7795ce0a2b..f4ee1af06d 100644 --- a/src/components/implementation/tests/micro_booter/mb_tests.c +++ b/src/components/implementation/tests/micro_booter/mb_tests.c @@ -146,10 +146,10 @@ async_thd_fn_perf(void *thdcap) arcvcap_t rc = rcc_global[cos_cpuid()]; int i; - cos_rcv(rc, 0, NULL); + cos_rcv(rc, 0); for (i = 0; i < ITER + 1; i++) { - cos_rcv(rc, 0, NULL); + cos_rcv(rc, 0); } cos_thd_switch(tc); @@ -187,35 +187,27 @@ async_thd_fn(void *thdcap) { thdcap_t tc = (thdcap_t)thdcap; arcvcap_t rc = rcc_global[cos_cpuid()]; - int pending, rcvd; + int pending; PRINTC("Asynchronous event thread handler.\n"); PRINTC("<-- rcving (non-blocking)...\n"); - pending = cos_rcv(rc, RCV_NON_BLOCKING, NULL); + pending = cos_rcv(rc, RCV_NON_BLOCKING); PRINTC("<-- pending %d\n", pending); - PRINTC("<-- rcving (non-blocking & all pending)...\n"); - pending = cos_rcv(rc, RCV_NON_BLOCKING | RCV_ALL_PENDING, &rcvd); - PRINTC("<-- rcvd %d\n", rcvd); - - PRINTC("<-- rcving (all pending)...\n"); - pending = cos_rcv(rc, RCV_ALL_PENDING, &rcvd); - PRINTC("<-- rcvd %d\n", rcvd); - PRINTC("<-- 
rcving...\n"); - pending = cos_rcv(rc, 0, NULL); + pending = cos_rcv(rc, 0); PRINTC("<-- pending %d\n", pending); PRINTC("<-- rcving...\n"); - pending = cos_rcv(rc, 0, NULL); + pending = cos_rcv(rc, 0); PRINTC("<-- pending %d\n", pending); PRINTC("<-- rcving (non-blocking)...\n"); - pending = cos_rcv(rc, RCV_NON_BLOCKING, NULL); + pending = cos_rcv(rc, RCV_NON_BLOCKING); PRINTC("<-- pending %d\n", pending); assert(pending == -EAGAIN); PRINTC("<-- rcving\n"); - pending = cos_rcv(rc, 0, NULL); + pending = cos_rcv(rc, 0); PRINTC("<-- Error: manually returning to snding thread.\n"); cos_thd_switch(tc); @@ -231,7 +223,7 @@ async_thd_parent(void *thdcap) asndcap_t sc = scp_global[cos_cpuid()]; int ret, pending; thdid_t tid; - int blocked, rcvd; + int blocked; cycles_t cycles, now; tcap_time_t thd_timeout; @@ -258,7 +250,7 @@ async_thd_parent(void *thdcap) PRINTC("--> Back in the asnder.\n"); PRINTC("--> receiving to get notifications\n"); - pending = cos_sched_rcv(rc, RCV_ALL_PENDING, 0, &rcvd, &tid, &blocked, &cycles, &thd_timeout); + pending = cos_sched_rcv(rc, 0, 0, &tid, &blocked, &cycles, &thd_timeout); rdtscll(now); PRINTC("--> pending %d, thdid %d, blocked %d, cycles %lld, timeout %lu (now=%llu, abs:%llu)\n", pending, tid, blocked, cycles, thd_timeout, now, tcap_time2cyc(thd_timeout, now)); @@ -371,7 +363,7 @@ test_timer(void) for (i = 0; i <= 16; i++) { thdid_t tid; - int blocked, rcvd; + int blocked; cycles_t cycles, now; tcap_time_t timer, thd_timeout; @@ -383,8 +375,8 @@ test_timer(void) rdtscll(c); if (i > 0) t += c - p; - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, 0, - &rcvd, &tid, &blocked, &cycles, &thd_timeout) != 0) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, + &tid, &blocked, &cycles, &thd_timeout) != 0) ; } @@ -467,7 +459,7 @@ test_budgets_single(void) PRINTC("\t%lld\n", e - s); /* FIXME: we should avoid calling this two times in the common case, return "more evts" */ - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, NULL, &tid, &blocked, &cycles, &thd_timeout) != 0) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, &tid, &blocked, &cycles, &thd_timeout) != 0) ; } PRINTC("Done.\n"); @@ -509,7 +501,7 @@ test_budgets_multi(void) assert(0); rdtscll(e); - cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, NULL, &tid, &blocked, &cycles, &thd_timeout); + cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, &tid, &blocked, &cycles, &thd_timeout); PRINTC("g:%llu c:%llu p:%llu => %llu, %d=%llu\n", mbt[cos_cpuid()].g.cyc - s, mbt[cos_cpuid()].c.cyc - s, mbt[cos_cpuid()].p.cyc - s, e - s, tid, cycles); } @@ -582,7 +574,7 @@ intr_thd(void *d) struct exec_cluster *w = &(((struct activation_test_data *)d)->w); while (1) { - cos_rcv(e->rc, 0, NULL); + cos_rcv(e->rc, 0); seq_order_check(e); cos_thd_wakeup(w->tc, w->tcc, w->prio, wakeup_budget_test[cos_cpuid()] ? 
TEST_WAKEUP_BUDGET : 0); } @@ -599,7 +591,7 @@ intr_sched_thd(void *d) tcap_time_t thd_timeout; while (1) { - cos_sched_rcv(e->rc, 0, 0, NULL, &tid, &blocked, &cycs, &thd_timeout); + cos_sched_rcv(e->rc, 0, 0, &tid, &blocked, &cycs, &thd_timeout); seq_order_check(e); if (wakeup_budget_test[cos_cpuid()]) { struct exec_cluster *w = &(((struct activation_test_data *)d)->w); @@ -715,7 +707,7 @@ receiver_thd(void *d) struct exec_cluster *e = &(((struct activation_test_data *)d)->w); while (1) { - cos_rcv(e->rc, 0, NULL); + cos_rcv(e->rc, 0); seq_order_check(e); } } @@ -729,7 +721,7 @@ sender_thd(void *d) while (1) { cos_asnd(r->sc, 0); seq_order_check(e); - cos_rcv(e->rc, 0, NULL); + cos_rcv(e->rc, 0); } } @@ -919,19 +911,19 @@ test_run_mb(void) static void block_vm(void) { - int blocked, rcvd; + int blocked; cycles_t cycles, now; tcap_time_t timeout, thd_timeout; thdid_t tid; - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING | RCV_NON_BLOCKING, 0, - &rcvd, &tid, &blocked, &cycles, &thd_timeout) > 0) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_NON_BLOCKING, 0, + &tid, &blocked, &cycles, &thd_timeout) > 0) ; rdtscll(now); now += (1000 * cyc_per_usec); timeout = tcap_cyc2time(now); - cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, RCV_ALL_PENDING, timeout, &rcvd, &tid, &blocked, &cycles, &thd_timeout); + cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, timeout, &tid, &blocked, &cycles, &thd_timeout); } /* diff --git a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c index b6db7a0af6..ebad3754fd 100644 --- a/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c +++ b/src/components/implementation/tests/unit_defcompinfo/unit_defcompinfo.c @@ -35,7 +35,7 @@ aep_thd_fn(arcvcap_t rcv, void *data) { printc("\tSwitched to aep %d\n", (int)data); while (1) { - cos_rcv(rcv, 0, NULL); + cos_rcv(rcv, 0); } } @@ -66,7 +66,7 @@ test_aeps(void) TCAP_DELEG_YIELD); assert(ret == 0); - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, NULL, &tid, &blocked, &cycs, &thd_timeout)) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, &tid, &blocked, &cycs, &thd_timeout)) ; } @@ -85,7 +85,7 @@ test_childcomps(void) thdid_t tid; tcap_time_t thd_timeout; - while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, NULL, &tid, &blocked, &cycs, &thd_timeout)) + while (cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, 0, &tid, &blocked, &cycs, &thd_timeout)) ; printc("\tSwitching to [%d] component\n", id); if (id == CHILD_SCHED_ID) { @@ -208,7 +208,7 @@ cos_init(void) /* TEST BLOCKING */ /* TODO: Challenge - how does a component know at runtime if can call cos_rcv or not? - It does not at * runtime. 
*/ - cos_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0, NULL); + cos_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, 0); printc("\tThis is a simple component\n"); SPIN(); diff --git a/src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c b/src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c index 8e09a2c49e..898077b8b4 100644 --- a/src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c +++ b/src/components/implementation/tests/unit_schedappaep/unit_schedappaep.c @@ -24,7 +24,7 @@ __test_child(arcvcap_t rcv, void *data) int ret; assert(taeps[cos_cpuid()][(int)data].rcv == rcv); - ret = cos_rcv(rcv, 0, NULL); + ret = cos_rcv(rcv, 0); assert(ret == 0); /* do nothing */ @@ -39,7 +39,7 @@ __test_parent(arcvcap_t rcv, void *data) int ret; assert(taeps[cos_cpuid()][(int)data].rcv == rcv); - ret = cos_rcv(rcv, 0, NULL); + ret = cos_rcv(rcv, 0); assert(ret == 0); parent_sent[cos_cpuid()] = 1; diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c index 83300dfd64..c8980eafc0 100644 --- a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c @@ -57,7 +57,6 @@ test_thd_perffn(void *data) rdtscll(end_cycs); //assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); assert(switched); - sl_scb_info_cpu()->reserved_debugging = 1; for (i = 0; i < PERF_ITERS; i++) { cycles_t diff1_cycs = 0, diff2_cycs = 0; @@ -89,7 +88,6 @@ test_thd_perffn(void *data) total_cycs += diff2_cycs; } - assert(sl_scb_info_cpu()->reserved_debugging == 1); PRINTC("SWITCH UBENCH (2 switches): avg: %llu, wc: %llu, bc: %llu, iters:%u\n", (total_cycs / (PERF_ITERS)), wc_cycs, bc_cycs, PERF_ITERS); testing = 0; /* done testing! free the spin thread! */ diff --git a/src/components/include/cos_component.h b/src/components/include/cos_component.h index 8508e5bf99..777c490c7c 100644 --- a/src/components/include/cos_component.h +++ b/src/components/include/cos_component.h @@ -10,6 +10,7 @@ #include #include +#include #include #include diff --git a/src/components/include/cos_kernel_api.h b/src/components/include/cos_kernel_api.h index ffc5d2c6c1..baa6aa28d7 100644 --- a/src/components/include/cos_kernel_api.h +++ b/src/components/include/cos_kernel_api.h @@ -163,10 +163,10 @@ int cos_thd_mod(struct cos_compinfo *ci, thdcap_t c, void *tls_addr); /* set tls int cos_sched_asnd(asndcap_t snd, tcap_time_t timeout, arcvcap_t srcv, sched_tok_t stok); /* returns 0 on success and -EINVAL on failure */ int cos_asnd(asndcap_t snd, int yield); -/* returns non-zero if there are still pending events (i.e. 
there have been pending snds) */ -int cos_rcv(arcvcap_t rcv, rcv_flags_t flags, int *rcvd); +/* returns 0 on success */ +int cos_rcv(arcvcap_t rcv, rcv_flags_t flags); /* returns the same value as cos_rcv, but also information about scheduling events */ -int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, int *rcvd, thdid_t *thdid, int *blocked, +int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, thdid_t *thdid, int *blocked, cycles_t *cycles, tcap_time_t *thd_timeout); int cos_introspect(struct cos_compinfo *ci, capid_t cap, unsigned long op); diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 3939d164cd..67b8ffebc3 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -723,4 +723,99 @@ sl_thd_yield(thdid_t tid) } } +/* TODO: where to put this code? */ +static inline int +__cos_sched_events_present(struct cos_sched_ring *r) +{ + PRINTC("%s:%d\n", __func__, __LINE__); + return !(r->tail == r->head); +} + +static inline int +__cos_sched_event_consume(struct cos_sched_ring *r, struct cos_sched_event *e) +{ + int f = 0; + + PRINTC("%s:%d\n", __func__, __LINE__); + if (!r || !e || !__cos_sched_events_present(r)) return 0; + PRINTC("%s:%d\n", __func__, __LINE__); + + f = ps_upfaa((unsigned long *)&r->head, 1); + + memcpy((void *)e, (void *)&(r->event_buf[f]), sizeof(struct cos_sched_event)); + PRINTC("%s:%d\n", __func__, __LINE__); + + return 1; +} + +static inline int +sl_sched_rcv_intern(struct cos_sched_event *e, int nonblock) +{ + int ret = 0; + struct sl_global_cpu *g = sl__globals_cpu(); + struct cos_sched_ring *r = &sl_scb_info_cpu()->sched_events; + + PRINTC("%s:%d %p %p %p\n", __func__, __LINE__, g, sl_scb_info_cpu(), r); + //memset(e, 0, sizeof(struct cos_sched_event)); +// if (unlikely(__cos_sched_event_consume(r, e) == 0)) { +// PRINTC("%s:%d\n", __func__, __LINE__); +// int blocked; +// thdid_t tid; +// cycles_t cycs; +// tcap_time_t timeout; +// +// ret = cos_sched_rcv(g->sched_rcv, nonblock ? RCV_NON_BLOCKING : 0, g->timeout_next, +// &tid, &blocked, &cycs, &timeout); +// PRINTC("%s:%d\n", __func__, __LINE__); +// if (ret < 0) return ret; +// +// PRINTC("%s:%d\n", __func__, __LINE__); +// e->tid = tid; +// e->evt.elapsed_cycs = cycs; +// e->evt.blocked = blocked; +// e->evt.next_timeout = timeout; +// PRINTC("%s:%d\n", __func__, __LINE__); +// } + + PRINTC("%s:%d\n", __func__, __LINE__); + return __cos_sched_events_present(r); +} + +static inline int +sl_sched_rcv_nonblock(struct cos_sched_event *e) +{ + return sl_sched_rcv_intern(e, 1); +} + +static inline int +sl_sched_rcv(struct cos_sched_event *e) +{ + return sl_sched_rcv_intern(e, 0); +} + +static inline int +sl_thd_rcv(void) +{ + int count = 0; + struct sl_thd *t = sl_thd_curr(); + unsigned long *p = &sl_thd_dcbinfo(t)->pending, q = 0; + +check: + sl_cs_enter(); + q = *p; + if (q == 0) { + count++; + + sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); + sl_cs_exit_switchto(sl__globals_cpu()->sched_thd); + + goto check; + } + + ps_upcas(p, q, 0); + sl_cs_exit(); + + return q; +} + #endif /* SL_H */ diff --git a/src/components/include/sl_thd.h b/src/components/include/sl_thd.h index 67c1d97ff1..f2c0107484 100644 --- a/src/components/include/sl_thd.h +++ b/src/components/include/sl_thd.h @@ -27,12 +27,6 @@ typedef enum { SL_THD_PROPERTY_SEND = (1<<1), /* use asnd to dispatch to this thread */ } sl_thd_property_t; -struct event_info { - int blocked; /* 1 - blocked. 
0 - awoken */ - cycles_t cycles; - tcap_time_t timeout; -}; - struct sl_thd { sl_thd_state_t state; /* @@ -93,7 +87,7 @@ struct sl_thd { cycles_t wakeup_cycs; /* actual last wakeup - used in timeout API for jitter information, etc */ int timeout_idx; /* timeout heap index, used in timeout API */ - struct event_info event_info; + struct cos_thd_event event_info; struct ps_list SL_THD_EVENT_LIST; /* list of events for the scheduler end-point */ struct cos_dcb_info *dcb; diff --git a/src/components/lib/cos_kernel_api.c b/src/components/lib/cos_kernel_api.c index d3f3ad4626..f516030e63 100644 --- a/src/components/lib/cos_kernel_api.c +++ b/src/components/lib/cos_kernel_api.c @@ -933,28 +933,26 @@ cos_asnd(asndcap_t snd, int yield) int cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, - int *rcvd, thdid_t *thdid, int *blocked, cycles_t *cycles, tcap_time_t *thd_timeout) + thdid_t *thdid, int *blocked, cycles_t *cycles, tcap_time_t *thd_timeout) { unsigned long thd_state = 0; unsigned long cyc = 0; int ret; + PRINTC("%s:%d\n", __func__, __LINE__); + ret = call_cap_retvals_asm(rcv, 0, flags, timeout, 0, 0, &thd_state, &cyc, thd_timeout); *blocked = (int)(thd_state >> (sizeof(thd_state) * 8 - 1)); *thdid = (thdid_t)(thd_state & ((1 << (sizeof(thdid_t) * 8)) - 1)); *cycles = cyc; - - if (ret >= 0 && flags & RCV_ALL_PENDING) { - *rcvd = (ret >> 1); - ret &= 1; - } + PRINTC("%s:%d\n", __func__, __LINE__); return ret; } int -cos_rcv(arcvcap_t rcv, rcv_flags_t flags, int *rcvd) +cos_rcv(arcvcap_t rcv, rcv_flags_t flags) { thdid_t tid = 0; int blocked; @@ -962,7 +960,7 @@ cos_rcv(arcvcap_t rcv, rcv_flags_t flags, int *rcvd) int ret; tcap_time_t thd_timeout; - ret = cos_sched_rcv(rcv, flags, 0, rcvd, &tid, &blocked, &cyc, &thd_timeout); + ret = cos_sched_rcv(rcv, flags, 0, &tid, &blocked, &cyc, &thd_timeout); assert(tid == 0); return ret; diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index b90ebdd394..62a936b034 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -484,31 +484,31 @@ sl_thd_yield_intern(thdid_t tid) void sl_thd_event_info_reset(struct sl_thd *t) { - t->event_info.blocked = 0; - t->event_info.cycles = 0; - t->event_info.timeout = 0; + t->event_info.blocked = 0; + t->event_info.elapsed_cycs = 0; + t->event_info.next_timeout = 0; } static inline void -sl_thd_event_enqueue(struct sl_thd *t, int blocked, cycles_t cycles, tcap_time_t timeout) +sl_thd_event_enqueue(struct sl_thd *t, struct cos_thd_event *e) { struct sl_global_cpu *g = sl__globals_cpu(); if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); - t->event_info.blocked = blocked; - t->event_info.cycles += cycles; - t->event_info.timeout = timeout; + t->event_info.blocked = e->blocked; + t->event_info.elapsed_cycs += e->elapsed_cycs; + t->event_info.next_timeout = e->next_timeout; } static inline void -sl_thd_event_dequeue(struct sl_thd *t, int *blocked, cycles_t *cycles, tcap_time_t *timeout) +sl_thd_event_dequeue(struct sl_thd *t, struct cos_thd_event *e) { ps_list_rem(t, SL_THD_EVENT_LIST); - *blocked = t->event_info.blocked; - *cycles = t->event_info.cycles; - *timeout = t->event_info.timeout; + e->blocked = t->event_info.blocked; + e->elapsed_cycs = t->event_info.elapsed_cycs; + e->next_timeout = t->event_info.next_timeout; sl_thd_event_info_reset(t); } @@ -636,29 +636,26 @@ static void sl_sched_loop_intern(int non_block) { struct sl_global_cpu *g = sl__globals_cpu(); - rcv_flags_t rfl = 
(non_block ? RCV_NON_BLOCKING : 0) | RCV_ALL_PENDING; + //rcv_flags_t rfl = (non_block ? RCV_NON_BLOCKING : 0) | RCV_ALL_PENDING; while (1) { int pending; do { - thdid_t tid; - int blocked, rcvd; - cycles_t cycles; - tcap_time_t timeout = g->timeout_next, thd_timeout; struct sl_thd *t = NULL, *tn = NULL; struct sl_child_notification notif; + struct cos_sched_event e = { .tid = 0 }; /* * a child scheduler may receive both scheduling notifications (block/unblock * states of it's child threads) and normal notifications (mainly activations from * it's parent scheduler). */ - pending = cos_sched_rcv(g->sched_rcv, rfl, timeout, - &rcvd, &tid, &blocked, &cycles, &thd_timeout); - if (!tid) goto pending_events; + pending = sl_sched_rcv_intern(&e, non_block); - t = sl_thd_lkup(tid); + if (!e.tid) goto pending_events; + + t = sl_thd_lkup(e.tid); assert(t); /* don't report the idle thread or a freed thread */ if (unlikely(t == g->idle_thd || t->state == SL_THD_FREE)) goto pending_events; @@ -670,7 +667,7 @@ sl_sched_loop_intern(int non_block) * To avoid dropping events, add the events to the scheduler event list and processing all * the pending events after the scheduler can successfully take the lock. */ - sl_thd_event_enqueue(t, blocked, cycles, thd_timeout); + sl_thd_event_enqueue(t, &e.evt); pending_events: if (ps_list_head_empty(&g->event_head) && @@ -688,21 +685,21 @@ sl_sched_loop_intern(int non_block) ps_list_foreach_del(&g->event_head, t, tn, SL_THD_EVENT_LIST) { /* remove the event from the list and get event info */ - sl_thd_event_dequeue(t, &blocked, &cycles, &thd_timeout); + sl_thd_event_dequeue(t, &e.evt); /* outdated event for a freed thread */ if (t->state == SL_THD_FREE) continue; - sl_mod_execution(sl_mod_thd_policy_get(t), cycles); + sl_mod_execution(sl_mod_thd_policy_get(t), e.evt.elapsed_cycs); - if (blocked) { + if (e.evt.blocked) { sl_thd_state_t state = SL_THD_BLOCKED; cycles_t abs_timeout = 0; - if (likely(cycles)) { - if (thd_timeout) { + if (likely(e.evt.elapsed_cycs)) { + if (e.evt.next_timeout) { state = SL_THD_BLOCKED_TIMEOUT; - abs_timeout = tcap_time2cyc(thd_timeout, sl_now()); + abs_timeout = tcap_time2cyc(e.evt.next_timeout, sl_now()); } sl_thd_sched_block_no_cs(t, state, abs_timeout); } diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 8ce9ac2bff..1ffafea638 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -101,7 +101,6 @@ cap_ulthd_lazyupdate(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, if (unlikely(!(*ci_ptr)->scb_data)) goto done; scb_core = (((*ci_ptr)->scb_data) + get_cpuid()); - scb_core->reserved_debugging = 0; if (unlikely(interrupt)) { assert(scb_core->sched_tok < ~0U); @@ -582,7 +581,7 @@ asnd_process(struct thread *rcv_thd, struct thread *thd, struct tcap *rcv_tcap, { struct thread *next; - thd_rcvcap_pending_inc(rcv_thd); + thd_rcvcap_pending_set(rcv_thd); next = notify_process(rcv_thd, thd, rcv_tcap, tcap, tcap_next, yield); /* @@ -714,7 +713,7 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st ret = cap_sched_tok_validate(rcvt, usr_counter, ci, cos_info); if (ret) return ret; - if (thd_rcvcap_pending(rcvt) > 0) { + if (thd_rcvcap_pending(rcvt)) { if (thd == rcvt) return -EBUSY; next = rcvt; @@ -797,7 +796,7 @@ cap_asnd_op(struct cap_asnd *asnd, struct thread *thd, struct pt_regs *regs, str ret = cap_sched_tok_validate(rcvt, usr_tok, ci, cos_info); if (ret) return ret; - if (thd_rcvcap_pending(rcvt) > 0) { + if (thd_rcvcap_pending(rcvt)) { if (thd == rcvt) return -EBUSY; next = rcvt; 
@@ -912,23 +911,25 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str rcv_flags_t rflags = __userregs_get1(regs); tcap_time_t swtimeout = TCAP_TIME_NIL; tcap_time_t timeout = __userregs_get2(regs); - int all_pending = (!!(rflags & RCV_ALL_PENDING)); + printk("%s:%d\n", __func__, __LINE__); if (unlikely(arcv->thd != thd || arcv->cpuid != get_cpuid())) return -EINVAL; /* deliver pending notifications? */ if (thd_rcvcap_pending(thd)) { + printk("%s:%d\n", __func__, __LINE__); __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); - thd_rcvcap_all_pending_set(thd, all_pending); thd_rcvcap_pending_deliver(thd, regs); return 0; } else if (rflags & RCV_NON_BLOCKING) { + printk("%s:%d\n", __func__, __LINE__); __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); __userregs_setretvals(regs, -EAGAIN, 0, 0, 0); return 0; } + printk("%s:%d\n", __func__, __LINE__); __userregs_setretvals(regs, 0, 0, 0, 0); next = notify_parent(thd, 0); @@ -963,7 +964,6 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str if (likely(thd != next)) { assert(!(thd->state & THD_STATE_PREEMPTED)); thd->state |= THD_STATE_RCVING; - thd_rcvcap_all_pending_set(thd, all_pending); thd->timeout = timeout; } diff --git a/src/kernel/include/component.h b/src/kernel/include/component.h index 6d6f8d18f9..79cfbd5546 100644 --- a/src/kernel/include/component.h +++ b/src/kernel/include/component.h @@ -12,6 +12,7 @@ #include "captbl.h" #include "pgtbl.h" #include "cap_ops.h" +#include "shared/cos_sched.h" struct comp_info { struct liveness_data liveness; diff --git a/src/kernel/include/shared/cos_sched.h b/src/kernel/include/shared/cos_sched.h new file mode 100644 index 0000000000..3b195f23cd --- /dev/null +++ b/src/kernel/include/shared/cos_sched.h @@ -0,0 +1,53 @@ +#ifndef COS_SCHED_H +#define COS_SCHED_H + +#include "./cos_types.h" + +struct cos_thd_event { + u16_t blocked; + u32_t next_timeout; + u64_t elapsed_cycs; +} __attribute__((packed)); + +struct cos_sched_event { + thdid_t tid; + struct cos_thd_event evt; +} __attribute__((packed)); + +#define COS_SCHED_EVENT_RING_SIZE 16 +#define COS_SCHED_EVENT_MASK (COS_SCHED_EVENT_RING_SIZE - 1) + +struct cos_sched_ring { + int head, tail; + struct cos_sched_event event_buf[COS_SCHED_EVENT_RING_SIZE]; +} __attribute__((packed)); + +struct cos_scb_info { + capid_t curr_thd; + cycles_t timer_next; + sched_tok_t sched_tok; + struct cos_sched_ring sched_events; +} CACHE_ALIGNED; + +struct cos_dcb_info { + unsigned long ip; + unsigned long sp; + unsigned long pending; /* binary value. TODO: move it to ip or sp */ +} __attribute__((packed)); + +/* + * This is the "ip" the kernel uses to update the thread when it sees that the + * thread is still in user-level dispatch routine. + * This is the offset of instruction after resetting the "next" thread's "sp" to zero + * in a purely user-level dispatch. + * + * Whenever kernel is switching to a thread which has "sp" non-zero, it would switch + * to the "ip" saved in the dcb_info and reset the "sp" of the thread that the kernel + * is dispatching to! + * This is necessary because, if the kernel is dispatching to a thread that was in the + * user-level dispatch routine before, then the only registers that it can restore are + * "ip" and "sp", everything else is either clobbered or saved/loaded at user-level. 
+ */ +#define DCB_IP_KERN_OFF 8 + +#endif /* COS_SCHED_H */ diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index 0ccb8ebb4d..02ff8d50ad 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -73,7 +73,6 @@ typedef enum { typedef enum { RCV_NON_BLOCKING = 1, - RCV_ALL_PENDING = 1 << 1, } rcv_flags_t; #define BOOT_LIVENESS_ID_BASE 2 @@ -425,33 +424,6 @@ struct cos_stack_freelists { /* #error "Assembly in requires that COMP_INFO_STACK_FREELISTS != 1 || * COMP_INFO_TMEM_STK_RELINQ != 0. Change the defines, or change the assembly" */ /* #endif */ -struct cos_scb_info { - capid_t curr_thd; - cycles_t timer_next; - sched_tok_t sched_tok; - int reserved_debugging; -} CACHE_ALIGNED; - -struct cos_dcb_info { - unsigned long ip; - unsigned long sp; - int reserved_debugging; -} __attribute__((packed)); - -/* - * This is the "ip" the kernel uses to update the thread when it sees that the - * thread is still in user-level dispatch routine. - * This is the offset of instruction after resetting the "next" thread's "sp" to zero - * in a purely user-level dispatch. - * - * Whenever kernel is switching to a thread which has "sp" non-zero, it would switch - * to the "ip" saved in the dcb_info and reset the "sp" of the thread that the kernel - * is dispatching to! - * This is necessary because, if the kernel is dispatching to a thread that was in the - * user-level dispatch routine before, then the only registers that it can restore are - * "ip" and "sp", everything else is either clobbered or saved/loaded at user-level. - */ -#define DCB_IP_KERN_OFF 8 struct cos_component_information { struct cos_stack_freelists cos_stacks; diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 797255dc50..b47755b4c5 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -34,7 +34,7 @@ struct invstk_entry { */ struct rcvcap_info { /* how many other arcv end-points send notifications to this one? 
*/ - int isbound, pending, refcnt, is_all_pending; + int isbound, pending, refcnt, is_init; sched_tok_t sched_count; struct tcap * rcvcap_tcap; /* This rcvcap's tcap */ struct thread *rcvcap_thd_notif; /* The parent rcvcap thread for notifications */ @@ -191,13 +191,13 @@ thd_next_thdinfo_update(struct cos_cpu_local_info *cli, struct thread *thd, stru } static void -thd_rcvcap_init(struct thread *t) +thd_rcvcap_init(struct thread *t, int is_init) { struct rcvcap_info *rc = &t->rcvcap; rc->isbound = rc->pending = rc->refcnt = 0; - rc->is_all_pending = 0; rc->sched_count = 0; + rc->is_init = is_init; rc->rcvcap_thd_notif = NULL; } @@ -230,36 +230,11 @@ thd_track_exec(struct thread *t) return !list_empty(&t->event_list); } -static void -thd_rcvcap_all_pending_set(struct thread *t, int val) -{ - t->rcvcap.is_all_pending = val; -} - -static int -thd_rcvcap_all_pending_get(struct thread *t) -{ - return t->rcvcap.is_all_pending; -} - -static int -thd_rcvcap_all_pending(struct thread *t) -{ - int pending = t->rcvcap.pending; - - /* receive all pending */ - t->rcvcap.pending = 0; - thd_rcvcap_all_pending_set(t, 0); - - return ((pending << 1) | !list_isempty(&t->event_head)); -} - static int thd_rcvcap_pending(struct thread *t) { if (t->rcvcap.pending) return t->rcvcap.pending; return !list_isempty(&t->event_head); - ; } static sched_tok_t @@ -275,20 +250,17 @@ thd_rcvcap_set_counter(struct thread *t, sched_tok_t cntr) } static void -thd_rcvcap_pending_inc(struct thread *arcvt) +thd_rcvcap_pending_set(struct thread *arcvt) { - arcvt->rcvcap.pending++; + arcvt->rcvcap.pending = 1; + + if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 1; } -static int -thd_rcvcap_pending_dec(struct thread *arcvt) +static void +thd_rcvcap_pending_reset(struct thread *arcvt) { - int pending = arcvt->rcvcap.pending; - - if (pending == 0) return 0; - arcvt->rcvcap.pending--; - - return pending; + arcvt->rcvcap.pending = 0; } static inline int @@ -372,7 +344,8 @@ thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, c assert(thd->tid <= MAX_NUM_THREADS); thd_scheduler_set(thd, thd_current(cli)); - thd_rcvcap_init(thd); + /* TODO: fix the way to specify scheduler in a component! */ + thd_rcvcap_init(thd, !init_data); list_head_init(&thd->event_head); list_init(&thd->event_list, thd); @@ -573,26 +546,68 @@ thd_preemption_state_update(struct thread *curr, struct thread *next, struct pt_ memcpy(&curr->regs, regs, sizeof(struct pt_regs)); } +static int +thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info) +{ + int delta = 0, inv_top = curr_invstk_top(cos_info); + struct cos_scb_info *scb = NULL; + struct cos_sched_ring *r = NULL; + struct comp_info *c = NULL; + + printk("%s:%d\n", __func__, __LINE__); + if (unlikely(inv_top != 0 || thd->rcvcap.is_init == 0)) return 0; + + printk("%s:%d\n", __func__, __LINE__); + c = thd_invstk_peek_compinfo(thd, cos_info, inv_top); + if (unlikely(!c || !c->scb_data)) return 0; + printk("%s:%d\n", __func__, __LINE__); + + scb = ((c->scb_data) + get_cpuid()); + r = &(scb->sched_events); + /* + * only produce more if the ring is empty! + * so the user only calls after dequeueing all previous events. 
+ */ + printk("%s:%d\n", __func__, __LINE__); + if (unlikely(r->head != r->tail)) return -EAGAIN; + + printk("%s:%d\n", __func__, __LINE__); + r->head = r->tail = 0; + while (delta < COS_SCHED_EVENT_RING_SIZE) { + printk("%s:%d\n", __func__, __LINE__); + struct cos_sched_event *e = &(r->event_buf[delta]); + unsigned long thd_state; + + if (!thd_state_evt_deliver(thd, &thd_state, (unsigned long *)&(e->evt.elapsed_cycs), + (unsigned long *)&(e->evt.next_timeout))) break; + printk("%s:%d\n", __func__, __LINE__); + e->tid = (thd_state << 1) >> 1; + e->evt.blocked = (thd_state >> 31); + + delta++; + } + printk("%s:%d\n", __func__, __LINE__); + + r->tail += delta; + + return delta; +} + static inline void thd_rcvcap_pending_deliver(struct thread *thd, struct pt_regs *regs) { - unsigned long thd_state = 0, cycles = 0, timeout = 0, pending = 0; - int all_pending = thd_rcvcap_all_pending_get(thd); + unsigned long thd_state = 0, cycles = 0, timeout = 0; thd_state_evt_deliver(thd, &thd_state, &cycles, &timeout); - if (all_pending) { - pending = thd_rcvcap_all_pending(thd); - } else { - thd_rcvcap_pending_dec(thd); - pending = thd_rcvcap_pending(thd); - } - __userregs_setretvals(regs, pending, thd_state, cycles, timeout); + thd_rcvcap_pending_reset(thd); + thd_sched_events_produce(thd, cos_cpu_local_info()); + __userregs_setretvals(regs, thd_rcvcap_pending(thd), thd_state, cycles, timeout); } static inline int thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) { - int preempt = 0; + int preempt = 0, pending = 0; /* TODO: check FPU */ /* fpu_save(thd); */ @@ -604,7 +619,7 @@ thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) assert(!(thd->state & THD_STATE_PREEMPTED)); thd->state &= ~THD_STATE_RCVING; thd_rcvcap_pending_deliver(thd, regs); - + pending = thd_rcvcap_pending(thd); /* * If a scheduler thread was running using child tcap and blocked on RCVING * and budget expended logic decided to run the scheduler thread with it's @@ -624,7 +639,7 @@ thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) } if (issame && preempt == 0) { - __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); + __userregs_set(regs, pending, __userregs_getsp(regs), __userregs_getip(regs)); } return preempt; From 5bfe50c5da718516ac1cc3f0c32295d4c12e9323 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 4 Apr 2019 23:12:21 -0400 Subject: [PATCH 044/127] Fixed cos_ulsched_rcv, unit-tested for basic use! * TODO: sl_thd_rcv() test! 
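* For reference, a minimal sketch of the consumer side introduced here: a
  scheduler drains user-level events from the shared ring via
  cos_ul_sched_rcv() and only falls back to the cos_sched_rcv() syscall when
  the ring is empty. The loop below mirrors sl_sched_loop_intern() from this
  patch; the function name example_drain_events() and the policy-hook comment
  are illustrative only, and it assumes the sl/cos headers from this tree.

static void
example_drain_events(struct sl_global_cpu *g)
{
	int pending;

	do {
		struct cos_sched_event e = { .tid = 0 };

		/* consume from the per-core ring in the SCB first; RCV_ULONLY
		 * would skip the kernel fallback entirely */
		pending = cos_ul_sched_rcv(g->sched_rcv, RCV_NON_BLOCKING, g->timeout_next, &e);
		if (pending < 0) break;

		if (e.tid) {
			/* hand e.evt (blocked / elapsed_cycs / next_timeout) to
			 * the scheduling policy, as sl_sched_loop_intern() does */
		}
	} while (pending > 0);
}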
--- src/components/include/cos_ulsched_rcv.h | 45 ++++++++++ src/components/include/sl.h | 107 +++++------------------ src/components/lib/cos_kernel_api.c | 3 - src/components/lib/sl/sl_sched.c | 8 +- src/kernel/capinv.c | 4 - src/kernel/include/shared/cos_sched.h | 1 - src/kernel/include/shared/cos_types.h | 1 + src/kernel/include/thd.h | 8 -- 8 files changed, 72 insertions(+), 105 deletions(-) create mode 100644 src/components/include/cos_ulsched_rcv.h diff --git a/src/components/include/cos_ulsched_rcv.h b/src/components/include/cos_ulsched_rcv.h new file mode 100644 index 0000000000..29a470e2b1 --- /dev/null +++ b/src/components/include/cos_ulsched_rcv.h @@ -0,0 +1,45 @@ +#ifndef COS_ULSCHED_RCV_H +#define COS_ULSCHED_RCV_H + +#include + +static inline int +__cos_sched_events_present(struct cos_sched_ring *r) +{ + return !(r->tail == r->head); +} + +static inline int +__cos_sched_event_consume(struct cos_sched_ring *r, struct cos_sched_event *e) +{ + int f = 0; + + if (!r || !e || !__cos_sched_events_present(r)) return 0; + *e = r->event_buf[f]; + f = ps_upfaa((unsigned long *)&r->head, 1); +// memcpy((void *)e, (void *)&(r->event_buf[f]), sizeof(struct cos_sched_event)); + + return 1; +} + +/* if other than sched-thread calls this, races will need to be handled by the caller! */ +static inline int +cos_ul_sched_rcv(arcvcap_t rcv, rcv_flags_t rfl, tcap_time_t timeout, struct cos_sched_event *evt) +{ + int ret = 0; + struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); + struct cos_sched_ring *r = &scb_cpu->sched_events; + + assert(scb_cpu); + /* a non-scheduler thread, should call with rcv == 0 to consume user-level events alone */ + if (unlikely(__cos_sched_event_consume(r, evt) == 0 + && rcv && !(rfl & RCV_ULONLY))) { + ret = cos_sched_rcv(rcv, rfl, timeout, &(evt->tid), (int *)&(evt->evt.blocked), + (cycles_t *)&(evt->evt.elapsed_cycs), (tcap_time_t *)&(evt->evt.next_timeout)); + if (unlikely(ret < 0)) return ret; + } + + return (ret || __cos_sched_events_present(r)); +} + +#endif /* COS_ULSCHED_RCV_H */ diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 67b8ffebc3..e77e903fa4 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -571,18 +571,18 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) * catch it. This is a little twitchy and subtle, so lets put * it in a function, here. */ -// if (likely(to)) { -// t = to; -// if (unlikely(!sl_thd_is_runnable(t))) to = NULL; -// } -// if (unlikely(!to)) { -// struct sl_thd_policy *pt = sl_mod_schedule(); -// -// if (unlikely(!pt)) -// t = sl__globals_cpu()->idle_thd; -// else -// t = sl_mod_thd_get(pt); -// } + if (likely(to)) { + t = to; + if (unlikely(!sl_thd_is_runnable(t))) to = NULL; + } + if (unlikely(!to)) { + struct sl_thd_policy *pt = sl_mod_schedule(); + + if (unlikely(!pt)) + t = sl__globals_cpu()->idle_thd; + else + t = sl_mod_thd_get(pt); + } #if 0 if (t->properties & SL_THD_PROPERTY_OWN_TCAP && t->budget) { @@ -723,88 +723,19 @@ sl_thd_yield(thdid_t tid) } } -/* TODO: where to put this code? 
*/ -static inline int -__cos_sched_events_present(struct cos_sched_ring *r) -{ - PRINTC("%s:%d\n", __func__, __LINE__); - return !(r->tail == r->head); -} - -static inline int -__cos_sched_event_consume(struct cos_sched_ring *r, struct cos_sched_event *e) -{ - int f = 0; - - PRINTC("%s:%d\n", __func__, __LINE__); - if (!r || !e || !__cos_sched_events_present(r)) return 0; - PRINTC("%s:%d\n", __func__, __LINE__); - - f = ps_upfaa((unsigned long *)&r->head, 1); - - memcpy((void *)e, (void *)&(r->event_buf[f]), sizeof(struct cos_sched_event)); - PRINTC("%s:%d\n", __func__, __LINE__); - - return 1; -} - -static inline int -sl_sched_rcv_intern(struct cos_sched_event *e, int nonblock) -{ - int ret = 0; - struct sl_global_cpu *g = sl__globals_cpu(); - struct cos_sched_ring *r = &sl_scb_info_cpu()->sched_events; - - PRINTC("%s:%d %p %p %p\n", __func__, __LINE__, g, sl_scb_info_cpu(), r); - //memset(e, 0, sizeof(struct cos_sched_event)); -// if (unlikely(__cos_sched_event_consume(r, e) == 0)) { -// PRINTC("%s:%d\n", __func__, __LINE__); -// int blocked; -// thdid_t tid; -// cycles_t cycs; -// tcap_time_t timeout; -// -// ret = cos_sched_rcv(g->sched_rcv, nonblock ? RCV_NON_BLOCKING : 0, g->timeout_next, -// &tid, &blocked, &cycs, &timeout); -// PRINTC("%s:%d\n", __func__, __LINE__); -// if (ret < 0) return ret; -// -// PRINTC("%s:%d\n", __func__, __LINE__); -// e->tid = tid; -// e->evt.elapsed_cycs = cycs; -// e->evt.blocked = blocked; -// e->evt.next_timeout = timeout; -// PRINTC("%s:%d\n", __func__, __LINE__); -// } - - PRINTC("%s:%d\n", __func__, __LINE__); - return __cos_sched_events_present(r); -} - -static inline int -sl_sched_rcv_nonblock(struct cos_sched_event *e) -{ - return sl_sched_rcv_intern(e, 1); -} - static inline int -sl_sched_rcv(struct cos_sched_event *e) +sl_thd_rcv(rcv_flags_t flags) { - return sl_sched_rcv_intern(e, 0); -} - -static inline int -sl_thd_rcv(void) -{ - int count = 0; struct sl_thd *t = sl_thd_curr(); unsigned long *p = &sl_thd_dcbinfo(t)->pending, q = 0; + assert(sl_thd_rcvcap(t)); check: sl_cs_enter(); q = *p; if (q == 0) { - count++; + if (unlikely(!(flags & RCV_ULONLY))) goto rcv; + if (unlikely(flags & RCV_NON_BLOCKING)) goto done; sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); sl_cs_exit_switchto(sl__globals_cpu()->sched_thd); @@ -812,10 +743,16 @@ sl_thd_rcv(void) goto check; } + /* cas may fail. but we got an event right now! 
*/ ps_upcas(p, q, 0); +done: sl_cs_exit(); return q; +rcv: + sl_cs_exit(); + + return cos_rcv(sl_thd_rcvcap(t), flags); } #endif /* SL_H */ diff --git a/src/components/lib/cos_kernel_api.c b/src/components/lib/cos_kernel_api.c index f516030e63..61c2b2d73c 100644 --- a/src/components/lib/cos_kernel_api.c +++ b/src/components/lib/cos_kernel_api.c @@ -939,14 +939,11 @@ cos_sched_rcv(arcvcap_t rcv, rcv_flags_t flags, tcap_time_t timeout, unsigned long cyc = 0; int ret; - PRINTC("%s:%d\n", __func__, __LINE__); - ret = call_cap_retvals_asm(rcv, 0, flags, timeout, 0, 0, &thd_state, &cyc, thd_timeout); *blocked = (int)(thd_state >> (sizeof(thd_state) * 8 - 1)); *thdid = (thdid_t)(thd_state & ((1 << (sizeof(thdid_t) * 8)) - 1)); *cycles = cyc; - PRINTC("%s:%d\n", __func__, __LINE__); return ret; } diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 62a936b034..3fcfd17bed 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -14,6 +14,7 @@ #include #include #include +#include struct sl_global sl_global_data; struct sl_global_cpu sl_global_cpu_data[NUM_CPU] CACHE_ALIGNED; @@ -455,11 +456,10 @@ sl_thd_yield_cs_exit_intern(thdid_t tid) struct sl_thd *t = sl_thd_curr(); /* reset rcv_suspended if the scheduler thinks "curr" was suspended on cos_rcv previously */ - //sl_thd_sched_unblock_no_cs(t); + sl_thd_sched_unblock_no_cs(t); if (likely(tid)) { struct sl_thd *to = sl_thd_lkup(tid); - //assert(to); sl_cs_exit_switchto(to); } else { sl_mod_yield(sl_mod_thd_policy_get(t), NULL); @@ -636,7 +636,7 @@ static void sl_sched_loop_intern(int non_block) { struct sl_global_cpu *g = sl__globals_cpu(); - //rcv_flags_t rfl = (non_block ? RCV_NON_BLOCKING : 0) | RCV_ALL_PENDING; + rcv_flags_t rfl = (non_block ? RCV_NON_BLOCKING : 0); while (1) { int pending; @@ -651,7 +651,7 @@ sl_sched_loop_intern(int non_block) * states of it's child threads) and normal notifications (mainly activations from * it's parent scheduler). */ - pending = sl_sched_rcv_intern(&e, non_block); + pending = cos_ul_sched_rcv(g->sched_rcv, rfl, g->timeout_next, &e); if (!e.tid) goto pending_events; diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 1ffafea638..c2763c5951 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -912,24 +912,20 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str tcap_time_t swtimeout = TCAP_TIME_NIL; tcap_time_t timeout = __userregs_get2(regs); - printk("%s:%d\n", __func__, __LINE__); if (unlikely(arcv->thd != thd || arcv->cpuid != get_cpuid())) return -EINVAL; /* deliver pending notifications? 
*/ if (thd_rcvcap_pending(thd)) { - printk("%s:%d\n", __func__, __LINE__); __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); thd_rcvcap_pending_deliver(thd, regs); return 0; } else if (rflags & RCV_NON_BLOCKING) { - printk("%s:%d\n", __func__, __LINE__); __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); __userregs_setretvals(regs, -EAGAIN, 0, 0, 0); return 0; } - printk("%s:%d\n", __func__, __LINE__); __userregs_setretvals(regs, 0, 0, 0, 0); next = notify_parent(thd, 0); diff --git a/src/kernel/include/shared/cos_sched.h b/src/kernel/include/shared/cos_sched.h index 3b195f23cd..eef5664464 100644 --- a/src/kernel/include/shared/cos_sched.h +++ b/src/kernel/include/shared/cos_sched.h @@ -15,7 +15,6 @@ struct cos_sched_event { } __attribute__((packed)); #define COS_SCHED_EVENT_RING_SIZE 16 -#define COS_SCHED_EVENT_MASK (COS_SCHED_EVENT_RING_SIZE - 1) struct cos_sched_ring { int head, tail; diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index 02ff8d50ad..a92369692b 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -73,6 +73,7 @@ typedef enum { typedef enum { RCV_NON_BLOCKING = 1, + RCV_ULONLY = (1 << 1), } rcv_flags_t; #define BOOT_LIVENESS_ID_BASE 2 diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index b47755b4c5..224c7e57fd 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -554,13 +554,10 @@ thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info struct cos_sched_ring *r = NULL; struct comp_info *c = NULL; - printk("%s:%d\n", __func__, __LINE__); if (unlikely(inv_top != 0 || thd->rcvcap.is_init == 0)) return 0; - printk("%s:%d\n", __func__, __LINE__); c = thd_invstk_peek_compinfo(thd, cos_info, inv_top); if (unlikely(!c || !c->scb_data)) return 0; - printk("%s:%d\n", __func__, __LINE__); scb = ((c->scb_data) + get_cpuid()); r = &(scb->sched_events); @@ -568,25 +565,20 @@ thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info * only produce more if the ring is empty! * so the user only calls after dequeueing all previous events. */ - printk("%s:%d\n", __func__, __LINE__); if (unlikely(r->head != r->tail)) return -EAGAIN; - printk("%s:%d\n", __func__, __LINE__); r->head = r->tail = 0; while (delta < COS_SCHED_EVENT_RING_SIZE) { - printk("%s:%d\n", __func__, __LINE__); struct cos_sched_event *e = &(r->event_buf[delta]); unsigned long thd_state; if (!thd_state_evt_deliver(thd, &thd_state, (unsigned long *)&(e->evt.elapsed_cycs), (unsigned long *)&(e->evt.next_timeout))) break; - printk("%s:%d\n", __func__, __LINE__); e->tid = (thd_state << 1) >> 1; e->evt.blocked = (thd_state >> 31); delta++; } - printk("%s:%d\n", __func__, __LINE__); r->tail += delta; From 43096d8087010e4bdd95e2c917821d32a5d932f5 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 18 Apr 2019 11:34:12 -0400 Subject: [PATCH 045/127] Add OMP & GOMP backend in Composite * Fixed some multi-core stuff, need to be up to date with rump_cfe_integration branch where I've perhaps more bugs fixed! * Added a OMP_HELLO program which is simple and just prints out HELLO. * COS_GOMP really doesn't do anything multi-core for now! 
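* For context, a rough sketch of how the omp_hello parallel region reaches
  this backend: gcc outlines the body of "#pragma omp parallel" into a
  function and lowers the pragma to a GOMP_parallel() call, which cos_gomp.c
  services by allocating num_threads - 1 sl threads and also running the
  region on the calling (master) thread. The names outlined_region() and
  run_region() below are hypothetical; only GOMP_parallel() and the
  omp_*/PRINTC calls come from this patch.

static void
outlined_region(void *data)
{
	int id = omp_get_thread_num();

	PRINTC(" Hello from process %d\n", id);
}

static int
run_region(void)
{
	/* num_threads == 0 lets the backend fall back to omp_get_max_threads() */
	GOMP_parallel(outlined_region, NULL, 0, 0);

	return 0;
}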
--- .../implementation/capmgr/naive/init.c | 2 +- .../no_interface/llbooter/llbooter.c | 2 +- .../no_interface/omp_hello/Makefile | 10 + .../no_interface/omp_hello/hello_omp.c | 140 ++++++++++++++ src/components/include/cos_omp.h | 18 ++ src/components/include/omp.h | 174 ++++++++++++++++++ src/components/lib/Makefile | 2 +- src/components/lib/cos_gomp.c | 77 ++++++++ src/components/lib/cos_omp.c | 66 +++++++ src/components/lib/sl/Makefile | 2 +- src/components/lib/sl/sl_mod_fifo.c | 105 +++++++++++ src/kernel/include/shared/cos_config.h | 2 +- src/platform/i386/qemu-kvm.sh | 2 +- src/platform/i386/runscripts/omp_hello.sh | 7 + 14 files changed, 603 insertions(+), 6 deletions(-) create mode 100644 src/components/implementation/no_interface/omp_hello/Makefile create mode 100644 src/components/implementation/no_interface/omp_hello/hello_omp.c create mode 100644 src/components/include/cos_omp.h create mode 100644 src/components/include/omp.h create mode 100644 src/components/lib/cos_gomp.c create mode 100644 src/components/lib/cos_omp.c create mode 100644 src/components/lib/sl/sl_mod_fifo.c create mode 100644 src/platform/i386/runscripts/omp_hello.sh diff --git a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index 2817b1b14c..33c11068c6 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -7,7 +7,7 @@ #include #include -static int capmgr_init_core_done = 0; +static volatile int capmgr_init_core_done = 0; static void capmgr_comp_info_iter_cpu(void) diff --git a/src/components/implementation/no_interface/llbooter/llbooter.c b/src/components/implementation/no_interface/llbooter/llbooter.c index 30a4406f77..23fe65e998 100644 --- a/src/components/implementation/no_interface/llbooter/llbooter.c +++ b/src/components/implementation/no_interface/llbooter/llbooter.c @@ -446,7 +446,7 @@ boot_comp_capinfo_init(void) } } -static int init_core_alloc_done = 0, core_init_done[NUM_CPU] = { 0 }; +static volatile int init_core_alloc_done = 0, core_init_done[NUM_CPU] = { 0 }; void cos_init(void) diff --git a/src/components/implementation/no_interface/omp_hello/Makefile b/src/components/implementation/no_interface/omp_hello/Makefile new file mode 100644 index 0000000000..aa2e8f2fac --- /dev/null +++ b/src/components/implementation/no_interface/omp_hello/Makefile @@ -0,0 +1,10 @@ +COMPONENT=omp_hello.o +INTERFACES= +DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp -lcos_omp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lcos_defkernel_api + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp diff --git a/src/components/implementation/no_interface/omp_hello/hello_omp.c b/src/components/implementation/no_interface/omp_hello/hello_omp.c new file mode 100644 index 0000000000..081b4a5821 --- /dev/null +++ b/src/components/implementation/no_interface/omp_hello/hello_omp.c @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include + +/******************************************************************************/ + +int main ( void ) + +/******************************************************************************/ +/* + Purpose: + + HELLO has each thread print out its ID. + + Discussion: + + HELLO is a "Hello, World" program for OpenMP. + + Licensing: + + This code is distributed under the GNU LGPL license. 
+ + Modified: + + 23 June 2010 + + Author: + + John Burkardt +*/ +{ + int id; + double wtime; + + PRINTC ( "\n" ); + PRINTC ( "HELLO_OPENMP\n" ); + PRINTC ( " C/OpenMP version\n" ); + + PRINTC ( "\n" ); + PRINTC ( " Number of processors available = %d\n", omp_get_num_procs ( ) ); + PRINTC ( " Number of threads = %d\n", omp_get_max_threads ( ) ); + + wtime = omp_get_wtime ( ); + + PRINTC ( "\n" ); + PRINTC ( " OUTSIDE the parallel region.\n" ); + PRINTC ( "\n" ); + + id = omp_get_thread_num ( ); + PRINTC ( " HELLO from process %d\n", id ) ; + + PRINTC ( "\n" ); + PRINTC ( " Going INSIDE the parallel region:\n" ); + PRINTC ( "\n" ); +/* + INSIDE THE PARALLEL REGION, have each thread say hello. +*/ +#if 0 +#pragma omp parallel +#pragma omp for + for (id = 0; id < 10; id++) { + PRINTC("id:%u\n", id); + } +#else +# pragma omp parallel \ + private ( id ) + { + id = omp_get_thread_num ( ); + PRINTC (" Hello from process %d\n", id ); + } +#endif +/* + Finish up by measuring the elapsed time. +*/ + wtime = omp_get_wtime ( ) - wtime; + + PRINTC ( "\n" ); + PRINTC ( " Back OUTSIDE the parallel region.\n" ); +/* + Terminate. +*/ + PRINTC ( "\n" ); + PRINTC ( "HELLO_OPENMP\n" ); + PRINTC ( " Normal end of execution.\n" ); + + PRINTC ( "\n" ); + PRINTC ( " Elapsed wall clock time = %f\n", wtime ); + + return 0; +} + +static void +cos_main(void *d) +{ + main(); +} + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + + if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { + PRINTC("In OpenMP-based Hello Program!\n"); + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + cos_omp_init(); + + } else { + while (!ps_load((unsigned long *)&init_done[first])) ; + + cos_defcompinfo_sched_init(); + sl_init(SL_MIN_PERIOD_US*100); + } + ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load((unsigned long *)&init_done[i])) ; + } + + if (!cos_cpuid()) { + struct sl_thd *t = NULL; + + sl_init(SL_MIN_PERIOD_US*100); + t = sl_thd_alloc(cos_main, NULL); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + } + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} diff --git a/src/components/include/cos_omp.h b/src/components/include/cos_omp.h new file mode 100644 index 0000000000..28cd98b035 --- /dev/null +++ b/src/components/include/cos_omp.h @@ -0,0 +1,18 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#ifndef COS_OMP_H +#define COS_OMP_H + +#include +#include + +#define COS_OMP_MAX_NUM_THREADS (NUM_CPU) + +extern void cos_omp_init(void); + +#endif /* COS_OMP_H */ diff --git a/src/components/include/omp.h b/src/components/include/omp.h new file mode 100644 index 0000000000..f3312ec5bc --- /dev/null +++ b/src/components/include/omp.h @@ -0,0 +1,174 @@ +/* Copyright (C) 2005-2017 Free Software Foundation, Inc. + Contributed by Richard Henderson . + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). 
+ + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* + * NOTE: This header is from gcc 7 customized + * to support only what is required in our environment + */ +#ifndef _OMP_H +#define _OMP_H 1 + +#ifndef _LIBGOMP_OMP_LOCK_DEFINED +#define _LIBGOMP_OMP_LOCK_DEFINED 1 +/* These two structures get edited by the libgomp build process to + reflect the shape of the two types. Their internals are private + to the library. */ + +typedef struct +{ + unsigned char _x[4] + __attribute__((__aligned__(4))); +} omp_lock_t; + +typedef struct +{ +#if defined(__linux__) + unsigned char _x[8 + sizeof (void *)] + __attribute__((__aligned__(sizeof (void *)))); +#else + unsigned char _x[16] + __attribute__((__aligned__(8))); +#endif +} omp_nest_lock_t; +#endif + +typedef enum omp_sched_t +{ + omp_sched_static = 1, + omp_sched_dynamic = 2, + omp_sched_guided = 3, + omp_sched_auto = 4 +} omp_sched_t; + +typedef enum omp_proc_bind_t +{ + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +typedef enum omp_lock_hint_t +{ + omp_lock_hint_none = 0, + omp_lock_hint_uncontended = 1, + omp_lock_hint_contended = 2, + omp_lock_hint_nonspeculative = 4, + omp_lock_hint_speculative = 8, +} omp_lock_hint_t; + +#ifdef __cplusplus +extern "C" { +# define __GOMP_NOTHROW throw () +#else +# define __GOMP_NOTHROW __attribute__((__nothrow__)) +#endif + +//extern void omp_set_num_threads (int) __GOMP_NOTHROW; +extern int omp_get_num_threads (void) __GOMP_NOTHROW; +extern int omp_get_max_threads (void) __GOMP_NOTHROW; +extern int omp_get_thread_num (void) __GOMP_NOTHROW; +extern int omp_get_num_procs (void) __GOMP_NOTHROW; + +//extern int omp_in_parallel (void) __GOMP_NOTHROW; +// +//extern void omp_set_dynamic (int) __GOMP_NOTHROW; +//extern int omp_get_dynamic (void) __GOMP_NOTHROW; +// +//extern void omp_set_nested (int) __GOMP_NOTHROW; +//extern int omp_get_nested (void) __GOMP_NOTHROW; +// +//extern void omp_init_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern void omp_init_lock_with_hint (omp_lock_t *, omp_lock_hint_t) +// __GOMP_NOTHROW; +//extern void omp_destroy_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern void omp_set_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern void omp_unset_lock (omp_lock_t *) __GOMP_NOTHROW; +//extern int omp_test_lock (omp_lock_t *) __GOMP_NOTHROW; +// +//extern void omp_init_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +//extern void omp_init_nest_lock_with_hint (omp_nest_lock_t *, omp_lock_hint_t) +// __GOMP_NOTHROW; +//extern void omp_destroy_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +//extern void omp_set_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +//extern void 
omp_unset_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +//extern int omp_test_nest_lock (omp_nest_lock_t *) __GOMP_NOTHROW; +// +extern double omp_get_wtime (void) __GOMP_NOTHROW; +//extern double omp_get_wtick (void) __GOMP_NOTHROW; +// +//extern void omp_set_schedule (omp_sched_t, int) __GOMP_NOTHROW; +//extern void omp_get_schedule (omp_sched_t *, int *) __GOMP_NOTHROW; +//extern int omp_get_thread_limit (void) __GOMP_NOTHROW; +//extern void omp_set_max_active_levels (int) __GOMP_NOTHROW; +//extern int omp_get_max_active_levels (void) __GOMP_NOTHROW; +//extern int omp_get_level (void) __GOMP_NOTHROW; +//extern int omp_get_ancestor_thread_num (int) __GOMP_NOTHROW; +//extern int omp_get_team_size (int) __GOMP_NOTHROW; +//extern int omp_get_active_level (void) __GOMP_NOTHROW; +// +//extern int omp_in_final (void) __GOMP_NOTHROW; +// +//extern int omp_get_cancellation (void) __GOMP_NOTHROW; +//extern omp_proc_bind_t omp_get_proc_bind (void) __GOMP_NOTHROW; +//extern int omp_get_num_places (void) __GOMP_NOTHROW; +//extern int omp_get_place_num_procs (int) __GOMP_NOTHROW; +//extern void omp_get_place_proc_ids (int, int *) __GOMP_NOTHROW; +//extern int omp_get_place_num (void) __GOMP_NOTHROW; +//extern int omp_get_partition_num_places (void) __GOMP_NOTHROW; +//extern void omp_get_partition_place_nums (int *) __GOMP_NOTHROW; +// +//extern void omp_set_default_device (int) __GOMP_NOTHROW; +//extern int omp_get_default_device (void) __GOMP_NOTHROW; +//extern int omp_get_num_devices (void) __GOMP_NOTHROW; +//extern int omp_get_num_teams (void) __GOMP_NOTHROW; +//extern int omp_get_team_num (void) __GOMP_NOTHROW; +// +//extern int omp_is_initial_device (void) __GOMP_NOTHROW; +//extern int omp_get_initial_device (void) __GOMP_NOTHROW; +//extern int omp_get_max_task_priority (void) __GOMP_NOTHROW; +// +//extern void *omp_target_alloc (__SIZE_TYPE__, int) __GOMP_NOTHROW; +//extern void omp_target_free (void *, int) __GOMP_NOTHROW; +//extern int omp_target_is_present (void *, int) __GOMP_NOTHROW; +//extern int omp_target_memcpy (void *, void *, __SIZE_TYPE__, __SIZE_TYPE__, +// __SIZE_TYPE__, int, int) __GOMP_NOTHROW; +//extern int omp_target_memcpy_rect (void *, void *, __SIZE_TYPE__, int, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, +// const __SIZE_TYPE__ *, int, int) +// __GOMP_NOTHROW; +//extern int omp_target_associate_ptr (void *, void *, __SIZE_TYPE__, +// __SIZE_TYPE__, int) __GOMP_NOTHROW; +//extern int omp_target_disassociate_ptr (void *, int) __GOMP_NOTHROW; + +#ifdef __cplusplus +} +#endif + +#endif /* _OMP_H */ diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index 85903b5e82..903cc13697 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o +LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o cos_omp.o cos_gomp.o LIBS=$(LIB_OBJS:%.o=%.a) MANDITORY=c_stub.o cos_asm_upcall.o cos_asm_ainv.o cos_component.o MAND=$(MANDITORY_LIB) diff --git a/src/components/lib/cos_gomp.c b/src/components/lib/cos_gomp.c new file mode 100644 index 0000000000..757359585e --- /dev/null +++ b/src/components/lib/cos_gomp.c @@ -0,0 +1,77 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + * + * + * NOTE: There is no header file for this library! + * This is a backend for GOMP API in GCC and + * replaces LIBGOMP for composite! + */ + +#include +#include +#include + +#define _THD_FIXED_PRIO 1 +#define _THD_LOCAL_ACTIVATE(t) sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, _THD_FIXED_PRIO)) + +static void +_cos_gomp_thd_fn(void *d) +{ + struct sl_thd *t = sl_thd_curr(); + struct cos_aep_info *a = sl_thd_aepinfo(t); + cos_thd_fn_t fn = NULL; + + assert(d == NULL); + + /* + * TODO: + * 1. Understand how gomp works with fn & data and what exactly is being passed! + * 2. If work-stealing.. well, where am I stealing from! (void *d) should help with that! + */ + + assert(a->fn); + fn = (cos_thd_fn_t)a->fn; + fn(a->data); + + sl_thd_exit(); +} + +static inline unsigned +_cos_gomp_num_threads(unsigned num_thds) +{ + return num_thds > 0 ? num_thds : (unsigned)omp_get_max_threads(); +} + +/* GOMP_parallel prototype from libgomp within gcc */ +void +GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, + unsigned int flags) +{ + /* FIXME: improve everything! */ + unsigned i; + + num_threads = _cos_gomp_num_threads(num_threads); + assert(num_threads <= MAX_NUM_THREADS); + for (i = 1; i < num_threads; i++) { + struct sl_thd *t = NULL; + struct cos_aep_info *a = NULL; + + /* TODO: any handling of AEPs? */ + t = sl_thd_alloc(_cos_gomp_thd_fn, NULL); + assert(t); + + a = sl_thd_aepinfo(t); + a->fn = (cos_aepthd_fn_t)fn; + a->data = data; + + _THD_LOCAL_ACTIVATE(t); + } + + sl_thd_yield(0); + + fn(data); + /* TODO: anything else to do in this master? thread */ +} diff --git a/src/components/lib/cos_omp.c b/src/components/lib/cos_omp.c new file mode 100644 index 0000000000..54ef11c92d --- /dev/null +++ b/src/components/lib/cos_omp.c @@ -0,0 +1,66 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include + +static unsigned int _cycs_per_usec = 0; + +#define _USEC_TO_SEC_d(x) (((double)x)/(double)(1000*1000)) +#define _CYCS_TO_SEC_d(x) _USEC_TO_SEC_d((x)/(double)_cycs_per_usec) + +__GOMP_NOTHROW double +omp_get_wtime(void) +{ + cycles_t now; + + rdtscll(now); + return _CYCS_TO_SEC_d(now); +} + +__GOMP_NOTHROW int +omp_get_num_procs(void) +{ + return NUM_CPU; +} + +__GOMP_NOTHROW int +omp_get_max_threads(void) +{ + return COS_OMP_MAX_NUM_THREADS; +} + +__GOMP_NOTHROW int +omp_get_num_threads(void) +{ + /* FIXME: number of threads in the current team! */ + return omp_get_max_threads(); +} + +__GOMP_NOTHROW int +omp_get_thread_num(void) +{ + /* + * thread number within a team of a parallel construct! + * master thd will be = 0 + * not the physical thread id. + * + * TODO: fetch from team structure? + * + * For now though, a big hack! 
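+	 * The hack: map the global cos thread id into the 0..max-1 range by
+	 * taking it modulo the maximum team size; proper team-local numbering
+	 * has to wait until per-team bookkeeping exists.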
+ */ + return (cos_thdid() % omp_get_max_threads()); +} + +void +cos_omp_init(void) +{ + _cycs_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + + assert(_cycs_per_usec); +} diff --git a/src/components/lib/sl/Makefile b/src/components/lib/sl/Makefile index f4bcf0a260..86567cfbb3 100644 --- a/src/components/lib/sl/Makefile +++ b/src/components/lib/sl/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_lock.o sl_thd_static_backend.o +LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_mod_fifo.o sl_lock.o sl_thd_static_backend.o LIBS=$(LIB_OBJS:%.o=%.a) CINC+=-m32 diff --git a/src/components/lib/sl/sl_mod_fifo.c b/src/components/lib/sl/sl_mod_fifo.c new file mode 100644 index 0000000000..4f6618f1f0 --- /dev/null +++ b/src/components/lib/sl/sl_mod_fifo.c @@ -0,0 +1,105 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include + +#define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US + +struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; + +/* No RR yet */ +void +sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) +{ } + +struct sl_thd_policy * +sl_mod_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + +void +sl_mod_block(struct sl_thd_policy *t) +{ + ps_list_rem_d(t); +} + +void +sl_mod_wakeup(struct sl_thd_policy *t) +{ + assert(ps_list_singleton_d(t)); + + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_yield(struct sl_thd_policy *t, struct sl_thd_policy *yield_to) +{ + /* should yield move the current thread to end of the runQ? don't think so! FIFO scheduler, so yield doesn't change the sched order! */ + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_thd_create(struct sl_thd_policy *t) +{ + t->priority = TCAP_PRIO_MIN; + t->period = 0; + t->period_usec = 0; + ps_list_init_d(t); + + /* TODO: add to runq here? for now, only add when PRIO is set and that's pretty much it's ARRIVAL time! 
*/ +} + +void +sl_mod_thd_delete(struct sl_thd_policy *t) +{ ps_list_rem_d(t); } + +void +sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) +{ + int cpu = cos_cpuid(); + + switch (type) { + case SCHEDP_PRIO: + { + t->priority = v; + sl_thd_setprio(sl_mod_thd_get(t), t->priority); + ps_list_head_append_d(&threads[cos_cpuid()], t); + + break; + } + case SCHEDP_WINDOW: + { + assert(v >= SL_FPRR_PERIOD_US_MIN); + t->period_usec = v; + t->period = sl_usec2cyc(v); + + break; + } + case SCHEDP_BUDGET: + { + break; + } + default: assert(0); + } +} + +void +sl_mod_init(void) +{ + ps_list_head_init(&threads[cos_cpuid()]); +} diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index b90ad107f3..ffe9a9cff2 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -17,7 +17,7 @@ #include "cpu_ghz.h" -#define NUM_CPU 1 +#define NUM_CPU 2 /* * 1 MB, note that this is not the PA of kernel-usable memory, instead diff --git a/src/platform/i386/qemu-kvm.sh b/src/platform/i386/qemu-kvm.sh index ea964376b4..b416a0f107 100755 --- a/src/platform/i386/qemu-kvm.sh +++ b/src/platform/i386/qemu-kvm.sh @@ -12,4 +12,4 @@ fi MODULES=$(sh $1 | awk '/^Writing image/ { print $3; }' | tr '\n' ' ') #qemu-system-i386 -m 768 -nographic -kernel kernel.img -no-reboot -s -initrd "$(echo $MODULES | tr ' ' ',')" -qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=1,threads=1 -cpu host -nographic -m 768 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" +qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=2,threads=1 -cpu host -nographic -m 768 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" diff --git a/src/platform/i386/runscripts/omp_hello.sh b/src/platform/i386/runscripts/omp_hello.sh new file mode 100644 index 0000000000..5284d0f941 --- /dev/null +++ b/src/platform/i386/runscripts/omp_hello.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp omp_hello.o boot.o +cp test_boot.o dummy1.o +cp test_boot.o dummy2.o +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From 656f5ebfdf1373d17064051247c05e40f79d6155 Mon Sep 17 00:00:00 2001 From: phani Date: Sun, 21 Apr 2019 15:19:37 -0400 Subject: [PATCH 046/127] Adds omp_dijkstra and more in gomp library * TODO: make the cos_gomp bookkeep for each "TEAM" and also enable cross-core execution! 
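
GCC (built with -fopenmp) lowers a `#pragma omp parallel` region by
outlining the region body into a separate function and emitting a call
to GOMP_parallel(), which is the entry point cos_gomp provides in place
of libgomp.  A minimal sketch of that lowering, with illustrative names
(region_fn and omp_data are not the symbols GCC really generates):

    /* source */
    #pragma omp parallel shared(mind, connected)
    {
            /* ... region body ... */
    }

    /* roughly what the compiler emits */
    struct omp_data { int *mind; int *connected; };

    static void
    region_fn(void *arg)
    {
            struct omp_data *d = arg;
            /* ... region body, shared data accessed through d ... */
    }

    struct omp_data d = { mind, connected };
    GOMP_parallel(region_fn, &d, 0 /* num_threads: runtime default */, 0 /* flags */);

GOMP_parallel() must therefore start the team, run region_fn on the
workers as well as on the calling (master) thread, and return only once
every team member has finished; the num_done counter added to
GOMP_parallel() in this patch is a first approximation of that join.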
--- .../no_interface/omp_dijkstra/Makefile | 10 + .../no_interface/omp_dijkstra/dijkstra_omp.c | 614 ++++++++++++++++++ .../no_interface/omp_dijkstra/posix_basic.c | 91 +++ .../no_interface/omp_hello/Makefile | 2 +- src/components/lib/cos_gomp.c | 45 +- src/platform/i386/runscripts/omp_dijkstra.sh | 7 + 6 files changed, 765 insertions(+), 4 deletions(-) create mode 100644 src/components/implementation/no_interface/omp_dijkstra/Makefile create mode 100644 src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c create mode 100644 src/components/implementation/no_interface/omp_dijkstra/posix_basic.c create mode 100644 src/platform/i386/runscripts/omp_dijkstra.sh diff --git a/src/components/implementation/no_interface/omp_dijkstra/Makefile b/src/components/implementation/no_interface/omp_dijkstra/Makefile new file mode 100644 index 0000000000..664e201ce2 --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/Makefile @@ -0,0 +1,10 @@ +COMPONENT=omp_dijkstra.o +INTERFACES= +DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp -lcos_omp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp diff --git a/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c new file mode 100644 index 0000000000..fd3b60d4e1 --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c @@ -0,0 +1,614 @@ +#include +#include +#include +#include +#include + +# define NV 6 + +//int main ( int argc, char **argv ); +int *dijkstra_distance ( int ohd[NV][NV] ); +void find_nearest ( int s, int e, int mind[NV], int connected[NV], int *d, + int *v ); +void init ( int ohd[NV][NV] ); +void timestamp ( void ); +void update_mind ( int s, int e, int mv, int connected[NV], int ohd[NV][NV], + int mind[NV] ); + +/******************************************************************************/ + +int main ( void )//int argc, char **argv ) + +/******************************************************************************/ +/* + Purpose: + + MAIN runs an example of Dijkstra's minimum distance algorithm. + + Discussion: + + Given the distance matrix that defines a graph, we seek a list + of the minimum distances between node 0 and all other nodes. + + This program sets up a small example problem and solves it. + + The correct minimum distances are: + + 0 35 15 45 49 41 + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 01 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. +*/ +{ + int i; + int i4_huge = 2147483647; + int j; + int *mind; + int ohd[NV][NV]; + + timestamp ( ); + PRINTC ( "\n" ); + PRINTC ( "DIJKSTRA_OPENMP\n" ); + PRINTC ( " C version\n" ); + PRINTC ( " Use Dijkstra's algorithm to determine the minimum\n" ); + PRINTC ( " distance from node 0 to each node in a graph,\n" ); + PRINTC ( " given the distances between each pair of nodes.\n" ); + PRINTC ( "\n" ); + PRINTC ( " Although a very small example is considered, we\n" ); + PRINTC ( " demonstrate the use of OpenMP directives for\n" ); + PRINTC ( " parallel execution.\n" ); +/* + Initialize the problem data. +*/ + init ( ohd ); +/* + Print the distance matrix. 
+*/ + PRINTC ( "\n" ); + PRINTC ( " Distance matrix:\n" ); + PRINTC ( "\n" ); + for ( i = 0; i < NV; i++ ) + { + for ( j = 0; j < NV; j++ ) + { + if ( ohd[i][j] == i4_huge ) + { + PRINTC ( " Inf" ); + } + else + { + PRINTC ( " %3d", ohd[i][j] ); + } + } + PRINTC ( "\n" ); + } +/* + Carry out the algorithm. +*/ + mind = dijkstra_distance ( ohd ); +/* + Print the results. +*/ + PRINTC ( "\n" ); + PRINTC ( " Minimum distances from node 0:\n"); + PRINTC ( "\n" ); + for ( i = 0; i < NV; i++ ) + { + PRINTC ( " %2d %2d\n", i, mind[i] ); + } +/* + Free memory. +*/ + free ( mind ); +/* + Terminate. +*/ + PRINTC ( "\n" ); + PRINTC ( "DIJKSTRA_OPENMP\n" ); + PRINTC ( " Normal end of execution.\n" ); + + PRINTC ( "\n" ); + timestamp ( ); + + return 0; +} +/******************************************************************************/ + +int *dijkstra_distance ( int ohd[NV][NV] ) + +/******************************************************************************/ +/* + Purpose: + + DIJKSTRA_DISTANCE uses Dijkstra's minimum distance algorithm. + + Discussion: + + We essentially build a tree. We start with only node 0 connected + to the tree, and this is indicated by setting CONNECTED[0] = 1. + + We initialize MIND[I] to the one step distance from node 0 to node I. + + Now we search among the unconnected nodes for the node MV whose minimum + distance is smallest, and connect it to the tree. For each remaining + unconnected node I, we check to see whether the distance from 0 to MV + to I is less than that recorded in MIND[I], and if so, we can reduce + the distance. + + After NV-1 steps, we have connected all the nodes to 0, and computed + the correct minimum distances. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Input, int OHD[NV][NV], the distance of the direct link between + nodes I and J. + + Output, int DIJKSTRA_DISTANCE[NV], the minimum distance from + node 0 to each node. +*/ +{ + int *connected; + int i; + int i4_huge = 2147483647; + int md; + int *mind; + int mv; + int my_first; + int my_id; + int my_last; + int my_md; + int my_mv; + int my_step; + int nth; +/* + Start out with only node 0 connected to the tree. +*/ + connected = ( int * ) malloc ( NV * sizeof ( int ) ); + + connected[0] = 1; + for ( i = 1; i < NV; i++ ) + { + connected[i] = 0; + } +/* + Initial estimate of minimum distance is the 1-step distance. +*/ + mind = ( int * ) malloc ( NV * sizeof ( int ) ); + + for ( i = 0; i < NV; i++ ) + { + mind[i] = ohd[0][i]; + } +/* + Begin the parallel region. +*/ + # pragma omp parallel private ( my_first, my_id, my_last, my_md, my_mv, my_step ) \ + shared ( connected, md, mind, mv, nth, ohd ) + { + my_id = omp_get_thread_num ( ); + nth = omp_get_num_threads ( ); + my_first = ( my_id * NV ) / nth; + my_last = ( ( my_id + 1 ) * NV ) / nth - 1; +/* + The SINGLE directive means that the block is to be executed by only + one thread, and that thread will be whichever one gets here first. +*/ + # pragma omp single + { + PRINTC ( "\n" ); + PRINTC ( " P%d: Parallel region begins with %d threads\n", my_id, nth ); + PRINTC ( "\n" ); + } + PRINTC ( " P%d: First=%d Last=%d\n", my_id, my_first, my_last ); + + for ( my_step = 1; my_step < NV; my_step++ ) + { +/* + Before we compare the results of each thread, set the shared variable + MD to a big value. Only one thread needs to do this. 
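+  (The SINGLE construct ends with an implicit barrier, so every thread
+  observes the reset MD and MV before the search below begins.)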
+*/ + # pragma omp single + { + md = i4_huge; + mv = -1; + } +/* + Each thread finds the nearest unconnected node in its part of the graph. + Some threads might have no unconnected nodes left. +*/ + find_nearest ( my_first, my_last, mind, connected, &my_md, &my_mv ); +/* + In order to determine the minimum of all the MY_MD's, we must insist + that only one thread at a time execute this block! +*/ + # pragma omp critical + { + if ( my_md < md ) + { + md = my_md; + mv = my_mv; + } + } +/* + This barrier means that ALL threads have executed the critical + block, and therefore MD and MV have the correct value. Only then + can we proceed. +*/ + # pragma omp barrier +/* + If MV is -1, then NO thread found an unconnected node, so we're done early. + OpenMP does not like to BREAK out of a parallel region, so we'll just have + to let the iteration run to the end, while we avoid doing any more updates. + + Otherwise, we connect the nearest node. +*/ + # pragma omp single + { + if ( mv != - 1 ) + { + connected[mv] = 1; + PRINTC ( " P%d: Connecting node %d.\n", my_id, mv ); + } + } +/* + Again, we don't want any thread to proceed until the value of + CONNECTED is updated. +*/ + # pragma omp barrier +/* + Now each thread should update its portion of the MIND vector, + by checking to see whether the trip from 0 to MV plus the step + from MV to a node is closer than the current record. +*/ + if ( mv != -1 ) + { + update_mind ( my_first, my_last, mv, connected, ohd, mind ); + } +/* + Before starting the next step of the iteration, we need all threads + to complete the updating, so we set a BARRIER here. +*/ + #pragma omp barrier + } +/* + Once all the nodes have been connected, we can exit. +*/ + # pragma omp single + { + PRINTC ( "\n" ); + PRINTC ( " P%d: Exiting parallel region.\n", my_id ); + } + } + + free ( connected ); + + return mind; +} +/******************************************************************************/ + +void find_nearest ( int s, int e, int mind[NV], int connected[NV], int *d, + int *v ) + +/******************************************************************************/ +/* + Purpose: + + FIND_NEAREST finds the nearest unconnected node. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Input, int S, E, the first and last nodes that are to be checked. + + Input, int MIND[NV], the currently computed minimum distance from + node 0 to each node. + + Input, int CONNECTED[NV], is 1 for each connected node, whose + minimum distance to node 0 has been determined. + + Output, int *D, the distance from node 0 to the nearest unconnected + node in the range S to E. + + Output, int *V, the index of the nearest unconnected node in the range + S to E. +*/ +{ + int i; + int i4_huge = 2147483647; + + *d = i4_huge; + *v = -1; + + for ( i = s; i <= e; i++ ) + { + if ( !connected[i] && ( mind[i] < *d ) ) + { + *d = mind[i]; + *v = i; + } + } + return; +} +/******************************************************************************/ + +void init ( int ohd[NV][NV] ) + +/******************************************************************************/ +/* + Purpose: + + INIT initializes the problem data. 
+ + Discussion: + + The graph uses 6 nodes, and has the following diagram and + distance matrix: + + N0--15--N2-100--N3 0 40 15 Inf Inf Inf + \ | / 40 0 20 10 25 6 + \ | / 15 20 0 100 Inf Inf + 40 20 10 Inf 10 100 0 Inf Inf + \ | / Inf 25 Inf Inf 0 8 + \ | / Inf 6 Inf Inf 8 0 + N1 + / \ + / \ + 6 25 + / \ + / \ + N5----8-----N4 + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Output, int OHD[NV][NV], the distance of the direct link between + nodes I and J. +*/ +{ + int i; + int i4_huge = 2147483647; + int j; + + for ( i = 0; i < NV; i++ ) + { + for ( j = 0; j < NV; j++ ) + { + if ( i == j ) + { + ohd[i][i] = 0; + } + else + { + ohd[i][j] = i4_huge; + } + } + } + ohd[0][1] = ohd[1][0] = 40; + ohd[0][2] = ohd[2][0] = 15; + ohd[1][2] = ohd[2][1] = 20; + ohd[1][3] = ohd[3][1] = 10; + ohd[1][4] = ohd[4][1] = 25; + ohd[2][3] = ohd[3][2] = 100; + ohd[1][5] = ohd[5][1] = 6; + ohd[4][5] = ohd[5][4] = 8; + + return; +} +/******************************************************************************/ + +void timestamp ( void ) + +/******************************************************************************/ +/* + Purpose: + + TIMESTAMP prints the current YMDHMS date as a time stamp. + + Example: + + 31 May 2001 09:45:54 AM + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 24 September 2003 + + Author: + + John Burkardt + + Parameters: + + None +*/ +{ +#if 0 +# define TIME_SIZE 40 + + static char time_buffer[TIME_SIZE]; + const struct tm *tm; + time_t now; + + now = time ( NULL ); + tm = localtime ( &now ); + + strftime ( time_buffer, TIME_SIZE, "%d %B %Y %I:%M:%S %p", tm ); + + PRINTC ( "%s\n", time_buffer ); + + return; +# undef TIME_SIZE +#else + cycles_t now; + + rdtscll(now); + PRINTC("%llu\n", now); +#endif +} +/******************************************************************************/ + +void update_mind ( int s, int e, int mv, int connected[NV], int ohd[NV][NV], + int mind[NV] ) + +/******************************************************************************/ +/* + Purpose: + + UPDATE_MIND updates the minimum distance vector. + + Discussion: + + We've just determined the minimum distance to node MV. + + For each unconnected node I in the range S to E, + check whether the route from node 0 to MV to I is shorter + than the currently known minimum distance. + + Licensing: + + This code is distributed under the GNU LGPL license. + + Modified: + + 02 July 2010 + + Author: + + Original C version by Norm Matloff, CS Dept, UC Davis. + This C version by John Burkardt. + + Parameters: + + Input, int S, E, the first and last nodes that are to be checked. + + Input, int MV, the node whose minimum distance to node 0 + has just been determined. + + Input, int CONNECTED[NV], is 1 for each connected node, whose + minimum distance to node 0 has been determined. + + Input, int OHD[NV][NV], the distance of the direct link between + nodes I and J. + + Input/output, int MIND[NV], the currently computed minimum distances + from node 0 to each node. On output, the values for nodes S through + E have been updated. 
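+  (This is the usual Dijkstra relaxation step, restricted to the thread's
+  own slice S..E of the node range.)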
+*/ +{ + int i; + int i4_huge = 2147483647; + + for ( i = s; i <= e; i++ ) + { + if ( !connected[i] ) + { + if ( ohd[mv][i] < i4_huge ) + { + if ( mind[mv] + ohd[mv][i] < mind[i] ) + { + mind[i] = mind[mv] + ohd[mv][i]; + } + } + } + } + return; +} + + +static void +cos_main(void *d) +{ + main(); +} + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + + if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { + PRINTC("In OpenMP-based Hello Program!\n"); + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + cos_omp_init(); + + } else { + while (!ps_load((unsigned long *)&init_done[first])) ; + + cos_defcompinfo_sched_init(); + sl_init(SL_MIN_PERIOD_US*100); + } + ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load((unsigned long *)&init_done[i])) ; + } + + if (!cos_cpuid()) { + struct sl_thd *t = NULL; + + sl_init(SL_MIN_PERIOD_US*100); + t = sl_thd_alloc(cos_main, NULL); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + } + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} diff --git a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c new file mode 100644 index 0000000000..c9ae04645d --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c @@ -0,0 +1,91 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +// HACK: The hack to end all hacks +void * +cos_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) +{ + void *ret=0; + + if (addr != NULL) { + printc("parameter void *addr is not supported!\n"); + errno = ENOTSUP; + return MAP_FAILED; + } + if (fd != -1) { + printc("file mapping is not supported!\n"); + errno = ENOTSUP; + return MAP_FAILED; + } + + int pages; + if (length % 4096) { + pages = length / 4096 + 1; + } else { + pages = length / 4096; + } + + addr = (void *)memmgr_heap_page_allocn(pages); + if (!addr){ + ret = (void *) -1; + } else { + ret = addr; + } + + if (ret == (void *)-1) { /* return value comes from man page */ + printc("mmap() failed!\n"); + /* This is a best guess about what went wrong */ + errno = ENOMEM; + } + return ret; +} + +long +cos_syscall_handler(int syscall_num, long a, long b, long c, long d, long e, long f, long g) +{ + if (syscall_num == __NR_clock_gettime) { + microsec_t microseconds = ps_tsc() / cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + time_t seconds = microseconds / 1000000; + long rest = microseconds % 1000000; + + *((struct timespec *)b) = (struct timespec) {seconds, rest}; + return 0; + } + + if (syscall_num == __NR_mmap || syscall_num == __NR_mmap2) { + return (long)cos_mmap((void *)a, (size_t)b, (int)c, (int)d, (int)e, (off_t)f); + } + + if (syscall_num == __NR_brk) { + return 0; + } + + printc("Unimplemented syscall number %d\n", syscall_num); + assert(0); + return 0; +} + +// Hack around thread local data +static int cancelstate = 0; + +int +pthread_setcancelstate(int new, int *old) +{ + if (new > 2) return EINVAL; 
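+	/* Note: a single process-wide state, not per-thread as POSIX requires. */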
+ + if (old) *old = cancelstate; + cancelstate = new; + return 0; +} diff --git a/src/components/implementation/no_interface/omp_hello/Makefile b/src/components/implementation/no_interface/omp_hello/Makefile index aa2e8f2fac..e62f427203 100644 --- a/src/components/implementation/no_interface/omp_hello/Makefile +++ b/src/components/implementation/no_interface/omp_hello/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_hello.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp -lcos_omp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lcos_defkernel_api +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp -lcos_omp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/lib/cos_gomp.c b/src/components/lib/cos_gomp.c index 757359585e..6d8a731049 100644 --- a/src/components/lib/cos_gomp.c +++ b/src/components/lib/cos_gomp.c @@ -12,20 +12,21 @@ #include #include +#include /* for now, single core lock! */ #include #define _THD_FIXED_PRIO 1 #define _THD_LOCAL_ACTIVATE(t) sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, _THD_FIXED_PRIO)) +static struct sl_lock _cos_gomp_lock = SL_LOCK_STATIC_INIT(); static void _cos_gomp_thd_fn(void *d) { + int *ndone = (int *)d; struct sl_thd *t = sl_thd_curr(); struct cos_aep_info *a = sl_thd_aepinfo(t); cos_thd_fn_t fn = NULL; - assert(d == NULL); - /* * TODO: * 1. Understand how gomp works with fn & data and what exactly is being passed! @@ -35,6 +36,7 @@ _cos_gomp_thd_fn(void *d) assert(a->fn); fn = (cos_thd_fn_t)a->fn; fn(a->data); + ps_faa((unsigned long *)ndone, 1); sl_thd_exit(); } @@ -52,6 +54,7 @@ GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, { /* FIXME: improve everything! */ unsigned i; + unsigned num_done = 0; num_threads = _cos_gomp_num_threads(num_threads); assert(num_threads <= MAX_NUM_THREADS); @@ -60,7 +63,7 @@ GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, struct cos_aep_info *a = NULL; /* TODO: any handling of AEPs? */ - t = sl_thd_alloc(_cos_gomp_thd_fn, NULL); + t = sl_thd_alloc(_cos_gomp_thd_fn, (void *)&num_done); assert(t); a = sl_thd_aepinfo(t); @@ -73,5 +76,41 @@ GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, sl_thd_yield(0); fn(data); + ps_faa((unsigned long *)&num_done, 1); /* TODO: anything else to do in this master? thread */ + + while (ps_load((unsigned long *)&num_done) < (unsigned long)num_threads) sl_thd_yield(0); +} + +bool +GOMP_single_start (void) +{ + static thdid_t t = 0; + + /* TODO: intelligence! */ + if (ps_cas((unsigned long *)&t, 0, cos_thdid())) return true; + if (t == cos_thdid()) return true; + + return false; +} + +void +GOMP_barrier (void) +{ + /* TODO: intelligence to wait for all threads in the team! */ + sl_thd_yield(0); +} + +void +GOMP_critical_start (void) +{ + /* TODO: a multi-core lock! */ + sl_lock_take(&_cos_gomp_lock); +} + +void +GOMP_critical_end (void) +{ + /* TODO: a multi-core lock! 
*/ + sl_lock_release(&_cos_gomp_lock); } diff --git a/src/platform/i386/runscripts/omp_dijkstra.sh b/src/platform/i386/runscripts/omp_dijkstra.sh new file mode 100644 index 0000000000..0906da77e8 --- /dev/null +++ b/src/platform/i386/runscripts/omp_dijkstra.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp omp_dijkstra.o boot.o +cp test_boot.o dummy1.o +cp test_boot.o dummy2.o +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From a5f3180bbfa0b28a26258f8bfe44fcdb1d65374c Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 22 Apr 2019 10:55:32 -0400 Subject: [PATCH 047/127] Fixed naming: `sl_xcpu` to `sl_xcore` and `nparams` in API --- src/components/Makefile.comp | 2 +- .../implementation/capmgr/naive/init.c | 4 +- .../no_interface/llbooter/llbooter.c | 2 +- .../implementation/sched/sched_init.c | 2 +- .../tests/unit_fprr/unit_fprr.c | 30 ++--- src/components/include/sl.h | 78 ++++++------ src/components/include/sl_xcore.h | 109 ++++++++++++++++ src/components/include/sl_xcpu.h | 110 ---------------- src/components/lib/cos_gomp.c | 2 +- src/components/lib/sl/Makefile | 2 +- src/components/lib/sl/sl_capmgr.c | 10 +- src/components/lib/sl/sl_child.c | 2 +- src/components/lib/sl/sl_raw.c | 8 +- src/components/lib/sl/sl_sched.c | 56 ++++---- src/components/lib/sl/sl_xcore.c | 119 +++++++++++++++++ src/components/lib/sl/sl_xcpu.c | 120 ------------------ 16 files changed, 327 insertions(+), 329 deletions(-) create mode 100644 src/components/include/sl_xcore.h delete mode 100644 src/components/include/sl_xcpu.h create mode 100644 src/components/lib/sl/sl_xcore.c delete mode 100644 src/components/lib/sl/sl_xcpu.c diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 78ece3ad0e..408923b930 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -60,6 +60,6 @@ SERVER_STUB=s_stub.o CLIENT_STUB=c_stub.o LIBCOSDEFKERN=-lcos_kernel_api -lcos_defkernel_api -LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcpu -lsl_child -lck +LIBSLCORE=$(LIBCOSDEFKERN) -lsl_sched -lheap -lsl_xcore -lsl_child -lck LIBSLCAPMGR=$(LIBSLCORE) -lsl_capmgr LIBSLRAW=$(LIBSLCORE) -lsl_raw -lcos_dcb diff --git a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index 33c11068c6..3531d87212 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -69,7 +69,7 @@ capmgr_comp_info_iter_cpu(void) cap_info_initthd_init(rci, ithd, 0); } else if (cos_spd_id() == spdid) { - cap_info_initthd_init(rci, sl__globals_cpu()->sched_thd, 0); + cap_info_initthd_init(rci, sl__globals_core()->sched_thd, 0); } else if (!sched_spdid && spdid) { struct sl_thd *booter_thd = cap_info_initthd(btinfo); dcbcap_t dcap; @@ -176,7 +176,7 @@ capmgr_comp_info_iter(void) cap_info_initthd_init(rci, ithd, 0); } else if (cos_spd_id() == spdid) { - cap_info_initthd_init(rci, sl__globals_cpu()->sched_thd, 0); + cap_info_initthd_init(rci, sl__globals_core()->sched_thd, 0); } else if (!sched_spdid && spdid) { struct sl_thd *booter_thd = cap_info_initthd(btinfo); dcbcap_t dcap; diff --git a/src/components/implementation/no_interface/llbooter/llbooter.c b/src/components/implementation/no_interface/llbooter/llbooter.c index 23fe65e998..6c054ce7d4 100644 --- a/src/components/implementation/no_interface/llbooter/llbooter.c +++ b/src/components/implementation/no_interface/llbooter/llbooter.c @@ -460,7 +460,7 @@ cos_init(void) if (cos_cpuid() == 
INIT_CORE) { capmgr_spdid = 0; - memset(root_spdid, 0, sizeof(int) * NUM_CPU); + memset(root_spdid, 0, sizeof(spdid_t) * NUM_CPU); memset(new_comp_cap_info, 0, sizeof(struct comp_cap_info) * (MAX_NUM_SPDS)); h = (struct cobj_header *)cos_comp_info.cos_poly[0]; diff --git a/src/components/implementation/sched/sched_init.c b/src/components/implementation/sched/sched_init.c index 204a31034e..8111e4b8c0 100644 --- a/src/components/implementation/sched/sched_init.c +++ b/src/components/implementation/sched/sched_init.c @@ -45,7 +45,7 @@ schedinit_child(void) if (unlikely(t)) continue; aep.tid = thdid; - aep.tc = sl_thd_tcap(sl__globals_cpu()->sched_thd); + aep.tc = sl_thd_tcap(sl__globals_core()->sched_thd); t = sl_thd_init_ext(&aep, init); if (!t) return 0; } while (thdid); diff --git a/src/components/implementation/tests/unit_fprr/unit_fprr.c b/src/components/implementation/tests/unit_fprr/unit_fprr.c index d3d929ac27..46f612fad8 100644 --- a/src/components/implementation/tests/unit_fprr/unit_fprr.c +++ b/src/components/implementation/tests/unit_fprr/unit_fprr.c @@ -106,51 +106,51 @@ test_swapping(void) sl_thd_block_timeout(0, wakeup); } -#define XCPU_THDS (NUM_CPU-1) +#define XCORE_THDS (NUM_CPU-1) #define THD_SLEEP_US (100 * 1000) -volatile unsigned int xcpu_thd_data[NUM_CPU][XCPU_THDS]; -volatile unsigned int xcpu_thd_counter[NUM_CPU]; +volatile unsigned int xcore_thd_data[NUM_CPU][XCORE_THDS]; +volatile unsigned int xcore_thd_counter[NUM_CPU]; static void -test_xcpu_fn(void *data) +test_xcore_fn(void *data) { cycles_t wakeup, elapsed; int cpu = *((unsigned int *)data) >> 16; int i = (*((unsigned int *)data) << 16) >> 16; - assert(i < XCPU_THDS); + assert(i < XCORE_THDS); wakeup = sl_now() + sl_usec2cyc(THD_SLEEP_US); elapsed = sl_thd_block_timeout(0, wakeup); - if (elapsed) xcpu_thd_counter[cpu] ++; + if (elapsed) xcore_thd_counter[cpu] ++; sl_thd_exit(); } static void -run_xcpu_tests() +run_xcore_tests() { int ret = 0, i, cpu = 0; if (NUM_CPU == 1) return; - memset((void *)xcpu_thd_data[cos_cpuid()], 0, sizeof(unsigned int) * XCPU_THDS); - xcpu_thd_counter[cos_cpuid()] = 0; + memset((void *)xcore_thd_data[cos_cpuid()], 0, sizeof(unsigned int) * XCORE_THDS); + xcore_thd_counter[cos_cpuid()] = 0; - for (i = 0; i < XCPU_THDS; i++) { + for (i = 0; i < XCORE_THDS; i++) { sched_param_t p[1]; if (cpu == cos_cpuid()) cpu++; cpu %= NUM_CPU; - xcpu_thd_data[cos_cpuid()][i] = (cpu << 16) | i; + xcore_thd_data[cos_cpuid()][i] = (cpu << 16) | i; p[0] = sched_param_pack(SCHEDP_PRIO, HIGH_PRIORITY); - ret = sl_xcpu_thd_alloc(cpu, test_xcpu_fn, (void *)&xcpu_thd_data[cos_cpuid()][i], p); + ret = sl_xcore_thd_alloc(cpu, test_xcore_fn, (void *)&xcore_thd_data[cos_cpuid()][i], 1, p); if (ret) break; cpu++; } - PRINTC("%s: Creating cross-CPU threads!\n", ret ? "FAILURE" : "SUCCESS"); - while (xcpu_thd_counter[cos_cpuid()] != XCPU_THDS) ; + PRINTC("%s: Creating cross-core threads!\n", ret ? "FAILURE" : "SUCCESS"); + while (xcore_thd_counter[cos_cpuid()] != XCORE_THDS) ; } static void @@ -161,7 +161,7 @@ run_tests() test_swapping(); PRINTC("%s: Swap back and forth!\n", (thd1_ran[cos_cpuid()] && thd2_ran[cos_cpuid()]) ? 
"SUCCESS" : "FAILURE"); - run_xcpu_tests(); + run_xcore_tests(); PRINTC("Unit-test done!\n"); sl_thd_exit(); diff --git a/src/components/include/sl.h b/src/components/include/sl.h index e77e903fa4..5256a254cb 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #undef SL_TIMEOUTS @@ -54,7 +54,7 @@ struct sl_cs { } u; }; -struct sl_global_cpu { +struct sl_global_core { struct sl_cs lock; thdcap_t sched_thdcap; @@ -72,18 +72,18 @@ struct sl_global_cpu { struct ps_list_head event_head; /* all pending events for sched end-point */ }; -extern struct sl_global_cpu sl_global_cpu_data[]; +extern struct sl_global_core sl_global_core_data[]; -static inline struct sl_global_cpu * -sl__globals_cpu(void) +static inline struct sl_global_core * +sl__globals_core(void) { - return &(sl_global_cpu_data[cos_cpuid()]); + return &(sl_global_core_data[cos_cpuid()]); } static inline struct cos_scb_info * -sl_scb_info_cpu(void) +sl_scb_info_core(void) { - return (sl__globals_cpu()->scb_info); + return (sl__globals_core()->scb_info); } static inline void @@ -145,7 +145,7 @@ sl_thd_curr(void) static inline int sl_cs_owner(void) { - return sl__globals_cpu()->lock.u.s.owner == sl_thd_thdcap(sl_thd_curr()); + return sl__globals_core()->lock.u.s.owner == sl_thd_thdcap(sl_thd_curr()); } /* ...not part of the public API */ @@ -161,7 +161,7 @@ sl_cs_owner(void) * -ve from cos_defswitch failure, allowing caller for ex: the scheduler thread to * check if it was -EBUSY to first recieve pending notifications before retrying lock. */ -int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_cpu *gcpu, struct sl_thd *curr, sched_tok_t tok); +int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, struct sl_thd *curr, sched_tok_t tok); /* * @csi: current critical section value * @cached: a cached copy of @csi @@ -169,28 +169,28 @@ int sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, * * @ret: returns 1 if we need a retry, 0 otherwise */ -int sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_cpu *gcpu, sched_tok_t tok); +int sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, sched_tok_t tok); /* Enter into the scheduler critical section */ static inline int sl_cs_enter_nospin(void) { #ifdef SL_CS - struct sl_global_cpu *gcpu = sl__globals_cpu(); - struct sl_thd *t = sl_thd_curr(); + struct sl_global_core *gcore = sl__globals_core(); + struct sl_thd *t = sl_thd_curr(); union sl_cs_intern csi, cached; assert(t); - csi.v = gcpu->lock.u.v; + csi.v = gcore->lock.u.v; cached.v = csi.v; if (unlikely(csi.s.owner)) { assert(0); - return sl_cs_enter_contention(&csi, &cached, gcpu, t, cos_sched_sync()); + return sl_cs_enter_contention(&csi, &cached, gcore, t, cos_sched_sync()); } csi.s.owner = sl_thd_thdcap(t); - if (!ps_upcas(&gcpu->lock.u.v, cached.v, csi.v)) return 1; + if (!ps_upcas(&gcore->lock.u.v, cached.v, csi.v)) return 1; #endif return 0; } @@ -227,22 +227,22 @@ static inline void sl_cs_exit(void) { #ifdef SL_CS - struct sl_global_cpu *gcpu = sl__globals_cpu(); + struct sl_global_core *gcore = sl__globals_core(); union sl_cs_intern csi, cached; assert(sl_cs_owner()); retry: - csi.v = gcpu->lock.u.v; + csi.v = gcore->lock.u.v; cached.v = csi.v; if (unlikely(csi.s.contention)) { assert(0); - if (sl_cs_exit_contention(&csi, 
&cached, gcpu, cos_sched_sync())) goto retry; + if (sl_cs_exit_contention(&csi, &cached, gcore, cos_sched_sync())) goto retry; return; } - if (!ps_upcas(&gcpu->lock.u.v, cached.v, 0)) goto retry; + if (!ps_upcas(&gcore->lock.u.v, cached.v, 0)) goto retry; #endif } @@ -315,13 +315,13 @@ void sl_thd_param_set(struct sl_thd *t, sched_param_t sp); static inline microsec_t sl_cyc2usec(cycles_t cyc) { - return cyc / sl__globals_cpu()->cyc_per_usec; + return cyc / sl__globals_core()->cyc_per_usec; } static inline cycles_t sl_usec2cyc(microsec_t usec) { - return usec * sl__globals_cpu()->cyc_per_usec; + return usec * sl__globals_core()->cyc_per_usec; } static inline cycles_t @@ -353,17 +353,17 @@ void sl_timeout_period(cycles_t period); static inline cycles_t sl_timeout_period_get(void) { - return sl__globals_cpu()->period; + return sl__globals_core()->period; } #ifdef SL_TIMEOUTS static inline void sl_timeout_oneshot(cycles_t absolute_us) { - sl__globals_cpu()->timer_next = absolute_us; - sl__globals_cpu()->timeout_next = tcap_cyc2time(absolute_us); + sl__globals_core()->timer_next = absolute_us; + sl__globals_core()->timeout_next = tcap_cyc2time(absolute_us); - sl_scb_info_cpu()->timer_next = absolute_us; + sl_scb_info_core()->timer_next = absolute_us; } static inline void @@ -424,7 +424,7 @@ int sl_thd_kern_dispatch(thdcap_t t); static inline int sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { - struct cos_scb_info *scb = sl_scb_info_cpu(); + struct cos_scb_info *scb = sl_scb_info_core(); assert(sl_thd_dcbinfo(curr) && sl_thd_dcbinfo(next)); /* @@ -470,7 +470,7 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); - if (likely(sl_scb_info_cpu()->sched_tok == tok)) return 0; + if (likely(sl_scb_info_core()->sched_tok == tok)) return 0; return -EAGAIN; } @@ -480,7 +480,7 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) { // struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); // struct cos_compinfo *ci = &dci->ci; -// struct sl_global_cpu *g = sl__globals_cpu(); +// struct sl_global_core *g = sl__globals_core(); // int ret = 0; #if 0 @@ -543,12 +543,12 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) { // return sl_thd_dispatch(to, cos_sched_sync(), sl_thd_curr()); #if 1 - struct sl_thd *t = to; -// struct sl_global_cpu *globals = sl__globals_cpu(); - sched_tok_t tok; -// cycles_t now; -// s64_t offset; -// int ret; + struct sl_thd *t = to; +// struct sl_global_core *globals = sl__globals_core(); + sched_tok_t tok; +// cycles_t now; +// s64_t offset; +// int ret; /* Don't abuse this, it is only to enable the tight loop around this function for races... 
*/ #ifdef SL_CS @@ -579,7 +579,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) struct sl_thd_policy *pt = sl_mod_schedule(); if (unlikely(!pt)) - t = sl__globals_cpu()->idle_thd; + t = sl__globals_core()->idle_thd; else t = sl_mod_thd_get(pt); } @@ -589,7 +589,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); assert(t->period); - assert(sl_thd_tcap(t) != sl__globals_cpu()->sched_tcap); + assert(sl_thd_tcap(t) != sl__globals_core()->sched_tcap); if (t->last_replenish == 0 || t->last_replenish + t->period <= now) { tcap_res_t currbudget = 0; @@ -601,7 +601,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) if (!cycles_same(currbudget, t->budget, SL_CYCS_DIFF) && currbudget < t->budget) { tcap_res_t transfer = t->budget - currbudget; - ret = cos_tcap_transfer(sl_thd_rcvcap(t), sl__globals_cpu()->sched_tcap, transfer, t->prio); + ret = cos_tcap_transfer(sl_thd_rcvcap(t), sl__globals_core()->sched_tcap, transfer, t->prio); } if (likely(ret == 0)) t->last_replenish = replenish; @@ -738,7 +738,7 @@ sl_thd_rcv(rcv_flags_t flags) if (unlikely(flags & RCV_NON_BLOCKING)) goto done; sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); - sl_cs_exit_switchto(sl__globals_cpu()->sched_thd); + sl_cs_exit_switchto(sl__globals_core()->sched_thd); goto check; } diff --git a/src/components/include/sl_xcore.h b/src/components/include/sl_xcore.h new file mode 100644 index 0000000000..40c5c54e19 --- /dev/null +++ b/src/components/include/sl_xcore.h @@ -0,0 +1,109 @@ +#ifndef SL_XCORE_H +#define SL_XCORE_H + +#include +#include +#include +#include + +#define SL_XCORE_PARAM_MAX 4 + +typedef enum { + SL_XCORE_THD_ALLOC = 0, + SL_XCORE_THD_ALLOC_EXT, + SL_XCORE_AEP_ALLOC, + SL_XCORE_AEP_ALLOC_EXT, + SL_XCORE_INITAEP_ALLOC, + SL_XCORE_THD_DEALLOC, /* thread delete, need it? */ +} sl_xcore_req_t; + +struct sl_xcore_request { + sl_xcore_req_t type; /* request type */ + cpuid_t client; /* client cpu making the request */ + int req_response; /* client needs a response */ + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ + + union { + struct { + cos_thd_fn_t fn; + void *data; + } sl_xcore_req_thd_alloc; + struct { + cos_thd_fn_t fn; + void *data; + int own_tcap; + cos_channelkey_t key; + } sl_xcore_req_aep_alloc; + struct { + thdclosure_index_t idx; /* TODO: create thread in another component ? */ + struct cos_defcompinfo *dci; + } sl_xcore_req_thd_alloc_ext; + struct { + thdclosure_index_t idx; + int own_tcap; + cos_channelkey_t key; + struct cos_defcompinfo *dci; + } sl_xcore_req_aep_alloc_ext; + struct { + int is_sched; + int own_tcap; + struct cos_defcompinfo *dci, *sched; + } sl_xcore_req_initaep_alloc; + }; +}; + +CK_RING_PROTOTYPE(xcore, sl_xcore_request); + +#define SL_XCORE_RING_SIZE (64 * sizeof(struct sl_xcore_request)) /* in sl_const.h? */ + +/* perhaps move these to sl.h? */ +struct sl_global { + struct ck_ring xcore_ring[NUM_CPU]; /* mpsc ring! */ + + struct sl_xcore_request xcore_rbuf[NUM_CPU][SL_XCORE_RING_SIZE]; + u32_t core_bmp[(NUM_CPU + 7)/8]; /* bitmap of cores this scheduler is running on! 
*/ + asndcap_t xcore_asnd[NUM_CPU][NUM_CPU]; + struct cos_scb_info *scb_area; +} CACHE_ALIGNED; + +extern struct sl_global sl_global_data; + +static inline struct sl_global * +sl__globals(void) +{ + return &sl_global_data; +} + +static inline struct ck_ring * +sl__ring(cpuid_t core) +{ + return &(sl__globals()->xcore_ring[core]); +} + +static inline struct ck_ring * +sl__ring_curr(void) +{ + return sl__ring(cos_cpuid()); +} + +static inline struct sl_xcore_request * +sl__ring_buffer(cpuid_t core) +{ + return (sl__globals()->xcore_rbuf[core]); +} + +static inline struct sl_xcore_request * +sl__ring_buffer_curr(void) +{ + return sl__ring_buffer(cos_cpuid()); +} + +int sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched_param_t params[]); +int sl_xcore_thd_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int nparams, sched_param_t params[]); +int sl_xcore_aep_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +int sl_xcore_aep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +int sl_xcore_initaep_alloc(cpuid_t core, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +int sl_xcore_initaep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); + +#endif /* SL_XCORE_H */ diff --git a/src/components/include/sl_xcpu.h b/src/components/include/sl_xcpu.h deleted file mode 100644 index 21d388eb80..0000000000 --- a/src/components/include/sl_xcpu.h +++ /dev/null @@ -1,110 +0,0 @@ -#ifndef SL_XCPU_H -#define SL_XCPU_H - -#include -#include -#include -#include - -#define SL_XCPU_PARAM_MAX 4 - -typedef enum { - SL_XCPU_THD_ALLOC = 0, - SL_XCPU_THD_ALLOC_EXT, - SL_XCPU_AEP_ALLOC, - SL_XCPU_AEP_ALLOC_EXT, - SL_XCPU_INITAEP_ALLOC, - SL_XCPU_THD_DEALLOC, /* thread delete, need it? */ -} sl_xcpu_req_t; - -struct sl_xcpu_request { - sl_xcpu_req_t type; /* request type */ - cpuid_t client; /* client cpu making the request */ - int req_response; /* client needs a response */ - sched_param_t params[SL_XCPU_PARAM_MAX]; /* scheduling parameters */ - int param_count; /* number of parameters */ - - union { - struct { - cos_thd_fn_t fn; - void *data; - } sl_xcpu_req_thd_alloc; - struct { - cos_thd_fn_t fn; - void *data; - int own_tcap; - cos_channelkey_t key; - } sl_xcpu_req_aep_alloc; - struct { - thdclosure_index_t idx; /* TODO: create thread in another component ? */ - struct cos_defcompinfo *dci; - } sl_xcpu_req_thd_alloc_ext; - struct { - thdclosure_index_t idx; - int own_tcap; - cos_channelkey_t key; - struct cos_defcompinfo *dci; - } sl_xcpu_req_aep_alloc_ext; - struct { - int is_sched; - int own_tcap; - struct cos_defcompinfo *dci, *sched; - } sl_xcpu_req_initaep_alloc; - }; -}; - -CK_RING_PROTOTYPE(xcpu, sl_xcpu_request); - -#define SL_XCPU_RING_SIZE (64 * sizeof(struct sl_xcpu_request)) /* in sl_const.h? */ - -/* perhaps move these to sl.h? */ -struct sl_global { - struct ck_ring xcpu_ring[NUM_CPU]; /* mpsc ring! */ - - struct sl_xcpu_request xcpu_rbuf[NUM_CPU][SL_XCPU_RING_SIZE]; - u32_t cpu_bmp[(NUM_CPU + 7)/8]; /* bitmap of cpus this scheduler is running on! 
*/ - asndcap_t xcpu_asnd[NUM_CPU][NUM_CPU]; - struct cos_scb_info *scb_area; -} CACHE_ALIGNED; - -extern struct sl_global sl_global_data; - -static inline struct sl_global * -sl__globals(void) -{ - return &sl_global_data; -} - -static inline struct ck_ring * -sl__ring(cpuid_t cpu) -{ - return &(sl__globals()->xcpu_ring[cpu]); -} - -static inline struct ck_ring * -sl__ring_curr(void) -{ - return sl__ring(cos_cpuid()); -} - -static inline struct sl_xcpu_request * -sl__ring_buffer(cpuid_t cpu) -{ - return (sl__globals()->xcpu_rbuf[cpu]); -} - -static inline struct sl_xcpu_request * -sl__ring_buffer_curr(void) -{ - return sl__ring_buffer(cos_cpuid()); -} - -/* perhaps move these to sl.h? */ -int sl_xcpu_thd_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, sched_param_t params[]); -int sl_xcpu_thd_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, sched_param_t params[]); -int sl_xcpu_aep_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, sched_param_t params[]); -int sl_xcpu_aep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, sched_param_t params[]); -int sl_xcpu_initaep_alloc(cpuid_t cpu, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, sched_param_t params[]); -int sl_xcpu_initaep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, sched_param_t params[]); - -#endif /* SL_XCPU_H */ diff --git a/src/components/lib/cos_gomp.c b/src/components/lib/cos_gomp.c index 6d8a731049..49b65a28ac 100644 --- a/src/components/lib/cos_gomp.c +++ b/src/components/lib/cos_gomp.c @@ -54,7 +54,7 @@ GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, { /* FIXME: improve everything! 
*/ unsigned i; - unsigned num_done = 0; + unsigned long num_done = 0; num_threads = _cos_gomp_num_threads(num_threads); assert(num_threads <= MAX_NUM_THREADS); diff --git a/src/components/lib/sl/Makefile b/src/components/lib/sl/Makefile index 86567cfbb3..f14d35ad63 100644 --- a/src/components/lib/sl/Makefile +++ b/src/components/lib/sl/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_mod_fifo.o sl_lock.o sl_thd_static_backend.o +LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcore.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_mod_fifo.o sl_lock.o sl_thd_static_backend.o LIBS=$(LIB_OBJS:%.o=%.a) CINC+=-m32 diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index 6bf0b432c6..941b0001dc 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -38,7 +38,7 @@ sl_shm_map(cbuf_t id) } void -sl_xcpu_asnd_alloc(void) +sl_xcore_asnd_alloc(void) { int i; @@ -47,11 +47,11 @@ sl_xcpu_asnd_alloc(void) thdid_t tid; if (i == cos_cpuid()) continue; - if (!bitmap_check(sl__globals()->cpu_bmp, i)) continue; + if (!bitmap_check(sl__globals()->core_bmp, i)) continue; snd = capmgr_asnd_rcv_create(BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(i)); assert(snd); - sl__globals()->xcpu_asnd[cos_cpuid()][i] = snd; + sl__globals()->xcore_asnd[cos_cpuid()][i] = snd; } } @@ -156,7 +156,7 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vad aep->thd = capmgr_thd_create_ext(comp->id, idx, &aep->tid, (struct cos_dcb_info **)dcbuaddr); if (!aep->thd) goto done; - aep->tc = sl_thd_tcap(sl__globals_cpu()->sched_thd); + aep->tc = sl_thd_tcap(sl__globals_core()->sched_thd); t = sl_thd_alloc_init(aep, 0, 0, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); @@ -399,7 +399,7 @@ sl_thd_retrieve_lazy(thdid_t tid) it = sl_thd_try_lkup(itid); assert(it); aep.tid = tid; - aep.tc = sl__globals_cpu()->sched_tcap; + aep.tc = sl__globals_core()->sched_tcap; t = sl_thd_init_ext_no_cs(&aep, it); /* if (tid != sl_thdid()) sl_cs_exit(); */ diff --git a/src/components/lib/sl/sl_child.c b/src/components/lib/sl/sl_child.c index d0acb796f5..a942149f47 100644 --- a/src/components/lib/sl/sl_child.c +++ b/src/components/lib/sl/sl_child.c @@ -52,7 +52,7 @@ sl_parent_notif_enqueue(struct sl_thd *thd, struct sl_child_notification *notif) return 0; } -/* there is only 1 parent per scheduler per cpu */ +/* there is only 1 parent per scheduler per core */ int sl_child_notif_map(cbuf_t id) { diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index 111d6956a6..6b6741e62a 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -29,7 +29,7 @@ sl_shm_map(cbuf_t id) } void -sl_xcpu_asnd_alloc(void) +sl_xcore_asnd_alloc(void) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); @@ -39,11 +39,11 @@ sl_xcpu_asnd_alloc(void) asndcap_t snd; if (i == cos_cpuid()) continue; - if (!bitmap_check(sl__globals()->cpu_bmp, i)) continue; + if (!bitmap_check(sl__globals()->core_bmp, i)) continue; snd = cos_asnd_alloc(ci, BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(i), ci->captbl_cap); assert(snd); - sl__globals()->xcpu_asnd[cos_cpuid()][i] = snd; + sl__globals()->xcore_asnd[cos_cpuid()][i] = snd; } } @@ -184,7 +184,7 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c /* NOTE: Cannot use stack-allocated cos_aep_info struct here */ if (prps & SL_THD_PROPERTY_OWN_TCAP) 
ret = cos_aep_alloc(aep, fn, data, dcap, doff); - else ret = cos_aep_tcap_alloc(aep, sl_thd_aepinfo(sl__globals_cpu()->sched_thd)->tc, + else ret = cos_aep_tcap_alloc(aep, sl_thd_aepinfo(sl__globals_core()->sched_thd)->tc, fn, data, dcap, doff); if (ret) goto done; diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 3fcfd17bed..e7c1d6d107 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include @@ -17,18 +17,18 @@ #include struct sl_global sl_global_data; -struct sl_global_cpu sl_global_cpu_data[NUM_CPU] CACHE_ALIGNED; +struct sl_global_core sl_global_core_data[NUM_CPU] CACHE_ALIGNED; static void sl_sched_loop_intern(int non_block) __attribute__((noreturn)); extern struct sl_thd *sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t prps, struct cos_dcb_info *dcb); -extern int sl_xcpu_process_no_cs(void); -extern void sl_xcpu_asnd_alloc(void); +extern int sl_xcore_process_no_cs(void); +extern void sl_xcore_asnd_alloc(void); /* * These functions are removed from the inlined fast-paths of the * critical section (cs) code to save on code size/locality */ int -sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_cpu *gcpu, struct sl_thd *curr, sched_tok_t tok) +sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, struct sl_thd *curr, sched_tok_t tok) { #ifdef SL_CS int ret; @@ -37,11 +37,11 @@ sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, stru assert(csi->s.owner != sl_thd_thdcap(curr)); if (!csi->s.contention) { csi->s.contention = 1; - if (!ps_upcas(&gcpu->lock.u.v, cached->v, csi->v)) return 1; + if (!ps_upcas(&gcore->lock.u.v, cached->v, csi->v)) return 1; } /* Switch to the owner of the critical section, with inheritance using our tcap/priority */ - if ((ret = cos_defswitch(csi->s.owner, curr->prio, csi->s.owner == sl_thd_thdcap(gcpu->sched_thd) ? - TCAP_TIME_NIL : gcpu->timeout_next, tok))) return ret; + if ((ret = cos_defswitch(csi->s.owner, curr->prio, csi->s.owner == sl_thd_thdcap(gcore->sched_thd) ? 
+ TCAP_TIME_NIL : gcore->timeout_next, tok))) return ret; /* if we have an outdated token, then we want to use the same repeat loop, so return to that */ #endif @@ -50,12 +50,12 @@ sl_cs_enter_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, stru /* Return 1 if we need a retry, 0 otherwise */ int -sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_cpu *gcpu, sched_tok_t tok) +sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struct sl_global_core *gcore, sched_tok_t tok) { #ifdef SL_CS - if (!ps_upcas(&gcpu->lock.u.v, cached->v, 0)) return 1; + if (!ps_upcas(&gcore->lock.u.v, cached->v, 0)) return 1; /* let the scheduler thread decide which thread to run next, inheriting our budget/priority */ - cos_defswitch(gcpu->sched_thdcap, sl_thd_curr()->prio, TCAP_TIME_NIL, tok); + cos_defswitch(gcore->sched_thdcap, sl_thd_curr()->prio, TCAP_TIME_NIL, tok); #endif return 0; @@ -342,7 +342,7 @@ sl_thd_block_expiry(struct sl_thd *t) { cycles_t abs_timeout = 0; - assert(t != sl__globals_cpu()->sched_thd); + assert(t != sl__globals_core()->sched_thd); if (!(t->properties & SL_THD_PROPERTY_OWN_TCAP)) { assert(!t->rcv_suspended); return; @@ -492,7 +492,7 @@ sl_thd_event_info_reset(struct sl_thd *t) static inline void sl_thd_event_enqueue(struct sl_thd *t, struct cos_thd_event *e) { - struct sl_global_cpu *g = sl__globals_cpu(); + struct sl_global_core *g = sl__globals_core(); if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); @@ -551,7 +551,7 @@ sl_timeout_period(microsec_t period) { cycles_t p = sl_usec2cyc(period); - sl__globals_cpu()->period = p; + sl__globals_core()->period = p; #ifdef SL_TIMEOUTS sl_timeout_relative(p); #endif @@ -564,7 +564,7 @@ sl_idle(void *d) /* call from the user? */ static void -sl_global_init(u32_t *cpu_bmp) +sl_global_init(u32_t *core_bmp) { struct sl_global *g = sl__globals(); unsigned int i = 0; @@ -573,10 +573,10 @@ sl_global_init(u32_t *cpu_bmp) assert(sizeof(struct cos_scb_info) * NUM_CPU <= COS_SCB_SIZE && COS_SCB_SIZE == PAGE_SIZE); for (i = 0; i < NUM_CPU; i++) { - if (!bitmap_check(cpu_bmp, i)) continue; + if (!bitmap_check(core_bmp, i)) continue; - bitmap_set(g->cpu_bmp, i); - ck_ring_init(sl__ring(i), SL_XCPU_RING_SIZE); + bitmap_set(g->core_bmp, i); + ck_ring_init(sl__ring(i), SL_XCORE_RING_SIZE); } g->scb_area = (struct cos_scb_info *)cos_scb_info_get(); } @@ -588,13 +588,13 @@ sl_init(microsec_t period) static unsigned long first = 1, init_done = 0; struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); - struct sl_global_cpu *g = sl__globals_cpu(); + struct sl_global_core *g = sl__globals_core(); struct cos_aep_info *ga = cos_sched_aep_get(dci); - u32_t cpu_bmp[(NUM_CPU + 7)/8] = { 0 }; /* TODO! pass from the user! */ + u32_t core_bmp[(NUM_CPU + 7)/8] = { 0 }; /* TODO! pass from the user! 
*/ if (ps_cas(&first, 1, 0)) { - bitmap_set_contig(cpu_bmp, 0, NUM_CPU, 1); - sl_global_init(cpu_bmp); + bitmap_set_contig(core_bmp, 0, NUM_CPU, 1); + sl_global_init(core_bmp); ps_faa(&init_done, 1); } else { @@ -603,7 +603,7 @@ sl_init(microsec_t period) } /* must fit in a word */ assert(sizeof(struct sl_cs) <= sizeof(unsigned long)); - memset(g, 0, sizeof(struct sl_global_cpu)); + memset(g, 0, sizeof(struct sl_global_core)); g->cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); g->lock.u.v = 0; @@ -627,7 +627,7 @@ sl_init(microsec_t period) g->idle_thd = sl_thd_alloc(sl_idle, NULL); assert(g->idle_thd); - sl_xcpu_asnd_alloc(); + sl_xcore_asnd_alloc(); return; } @@ -635,8 +635,8 @@ sl_init(microsec_t period) static void sl_sched_loop_intern(int non_block) { - struct sl_global_cpu *g = sl__globals_cpu(); - rcv_flags_t rfl = (non_block ? RCV_NON_BLOCKING : 0); + struct sl_global_core *g = sl__globals_core(); + rcv_flags_t rfl = (non_block ? RCV_NON_BLOCKING : 0); while (1) { int pending; @@ -717,7 +717,7 @@ sl_sched_loop_intern(int non_block) } /* process cross-core requests */ - sl_xcpu_process_no_cs(); + sl_xcore_process_no_cs(); sl_cs_exit(); } while (pending > 0); @@ -743,6 +743,6 @@ sl_sched_loop_nonblock(void) int sl_thd_kern_dispatch(thdcap_t t) { - //return cos_switch(t, sl__globals_cpu()->sched_tcap, 0, sl__globals_cpu()->timeout_next, sl__globals_cpu()->sched_rcv, cos_sched_sync()); + //return cos_switch(t, sl__globals_core()->sched_tcap, 0, sl__globals_core()->timeout_next, sl__globals_core()->sched_rcv, cos_sched_sync()); return cos_thd_switch(t); } diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c new file mode 100644 index 0000000000..f7157520c3 --- /dev/null +++ b/src/components/lib/sl/sl_xcore.c @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include + +#define SL_REQ_THD_ALLOC(req, fn, data) do { \ + req.type = SL_XCORE_THD_ALLOC; \ + req.client = cos_cpuid(); \ + req.req_response = 0; \ + req.sl_xcore_req_thd_alloc.fn = fn; \ + req.sl_xcore_req_thd_alloc.data = data; \ + } while (0) + +extern struct sl_thd *sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data); + +int +sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched_param_t params[]) +{ + int ret = 0; + asndcap_t snd = 0; + struct sl_xcore_request req; + + if (core == cos_cpuid()) return -EINVAL; + if (!bitmap_check(sl__globals()->core_bmp, core)) return -EINVAL; + + sl_cs_enter(); + + SL_REQ_THD_ALLOC(req, fn, data); + if (nparams) memcpy(req.params, params, sizeof(sched_param_t) * nparams); + req.param_count = nparams; + if (ck_ring_enqueue_mpsc_xcore(sl__ring(core), sl__ring_buffer(core), &req) != true) { + ret = -ENOMEM; + } else { + snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; + assert(snd); + } + + sl_cs_exit(); + /* if (!snd) return -1; */ + /* send an IPI for the request */ + /* cos_asnd(snd, 0); */ + + return ret; +} + +int +sl_xcore_thd_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int nparams, sched_param_t params[]) +{ + return -ENOTSUP; +} + +int +sl_xcore_aep_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return -ENOTSUP; +} + +int +sl_xcore_aep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return -ENOTSUP; +} + +int +sl_xcore_initaep_alloc(cpuid_t core, struct cos_defcompinfo *dci, int own_tcap, 
cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return -ENOTSUP; +} + +int +sl_xcore_initaep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) +{ + return -ENOTSUP; +} + +int +sl_xcore_process_no_cs(void) +{ + int num = 0; + struct sl_xcore_request xcore_req; + + while (ck_ring_dequeue_mpsc_xcore(sl__ring_curr(), sl__ring_buffer_curr(), &xcore_req) == true) { + + assert(xcore_req.client != cos_cpuid()); + switch(xcore_req.type) { + case SL_XCORE_THD_ALLOC: + { + cos_thd_fn_t fn = xcore_req.sl_xcore_req_thd_alloc.fn; + void *data = xcore_req.sl_xcore_req_thd_alloc.data; + struct sl_thd *t; + int i; + + assert(fn); + + t = sl_thd_alloc_no_cs(fn, data); + assert(t); + for (i = 0; i < xcore_req.param_count; i++) { + sl_thd_param_set(t, xcore_req.params[i]); + } + + break; + } + case SL_XCORE_THD_ALLOC_EXT: + case SL_XCORE_AEP_ALLOC: + case SL_XCORE_AEP_ALLOC_EXT: + case SL_XCORE_INITAEP_ALLOC: + case SL_XCORE_THD_DEALLOC: + default: + { + PRINTC("Unimplemented request! Aborting!\n"); + assert(0); + } + } + num ++; + } + + return num; /* number of requests processed */ +} diff --git a/src/components/lib/sl/sl_xcpu.c b/src/components/lib/sl/sl_xcpu.c deleted file mode 100644 index a44f84f5e3..0000000000 --- a/src/components/lib/sl/sl_xcpu.c +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include -#include -#include - -#define SL_REQ_THD_ALLOC(req, fn, data) do { \ - req.type = SL_XCPU_THD_ALLOC; \ - req.client = cos_cpuid(); \ - req.req_response = 0; \ - req.sl_xcpu_req_thd_alloc.fn = fn; \ - req.sl_xcpu_req_thd_alloc.data = data; \ - } while (0) - -extern struct sl_thd *sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data); - -int -sl_xcpu_thd_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, sched_param_t params[]) -{ - int i, sz = sizeof(params) / sizeof(params[0]); - int ret = 0; - asndcap_t snd = 0; - struct sl_xcpu_request req; - - if (cpu == cos_cpuid()) return -EINVAL; - if (!bitmap_check(sl__globals()->cpu_bmp, cpu)) return -EINVAL; - - sl_cs_enter(); - - SL_REQ_THD_ALLOC(req, fn, data); - memcpy(req.params, params, sizeof(sched_param_t) * sz); - req.param_count = sz; - if (ck_ring_enqueue_mpsc_xcpu(sl__ring(cpu), sl__ring_buffer(cpu), &req) != true) { - ret = -ENOMEM; - } else { - snd = sl__globals()->xcpu_asnd[cos_cpuid()][cpu]; - assert(snd); - } - - sl_cs_exit(); - /* if (!snd) return -1; */ - /* send an IPI for the request */ - /* cos_asnd(snd, 0); */ - - return ret; -} - -int -sl_xcpu_thd_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_aep_alloc(cpuid_t cpu, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_aep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_initaep_alloc(cpuid_t cpu, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_initaep_alloc_ext(cpuid_t cpu, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, sched_param_t params[]) -{ - return -ENOTSUP; -} - -int -sl_xcpu_process_no_cs(void) -{ - int num = 0; - struct sl_xcpu_request xcpu_req; - - while (ck_ring_dequeue_mpsc_xcpu(sl__ring_curr(), sl__ring_buffer_curr(), &xcpu_req) == 
true) { - - assert(xcpu_req.client != cos_cpuid()); - switch(xcpu_req.type) { - case SL_XCPU_THD_ALLOC: - { - cos_thd_fn_t fn = xcpu_req.sl_xcpu_req_thd_alloc.fn; - void *data = xcpu_req.sl_xcpu_req_thd_alloc.data; - struct sl_thd *t; - int i; - - assert(fn); - - t = sl_thd_alloc_no_cs(fn, data); - assert(t); - for (i = 0; i < xcpu_req.param_count; i++) { - sl_thd_param_set(t, xcpu_req.params[i]); - } - - break; - } - case SL_XCPU_THD_ALLOC_EXT: - case SL_XCPU_AEP_ALLOC: - case SL_XCPU_AEP_ALLOC_EXT: - case SL_XCPU_INITAEP_ALLOC: - case SL_XCPU_THD_DEALLOC: - default: - { - PRINTC("Unimplemented request! Aborting!\n"); - assert(0); - } - } - num ++; - } - - return num; /* number of requests processed */ -} From c6d004719b4e829a8af7e6e4ff08c6d3dddd902c Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 22 Apr 2019 16:01:15 -0400 Subject: [PATCH 048/127] Modified sl_xcore allocation api to be synchronous --- .../tests/unit_fprr/unit_fprr.c | 33 ++- src/components/include/sl.h | 6 +- src/components/include/sl_xcore.h | 70 ++++- src/components/lib/cos_component.c | 8 + src/components/lib/sl/sl_capmgr.c | 4 +- src/components/lib/sl/sl_raw.c | 4 +- src/components/lib/sl/sl_sched.c | 2 +- src/components/lib/sl/sl_xcore.c | 266 ++++++++++++++---- 8 files changed, 315 insertions(+), 78 deletions(-) diff --git a/src/components/implementation/tests/unit_fprr/unit_fprr.c b/src/components/implementation/tests/unit_fprr/unit_fprr.c index 46f612fad8..9fd11b7e21 100644 --- a/src/components/implementation/tests/unit_fprr/unit_fprr.c +++ b/src/components/implementation/tests/unit_fprr/unit_fprr.c @@ -8,11 +8,12 @@ #include #include #include +#include /* Ensure this is the same as what is in sl_mod_fprr.c */ #define SL_FPRR_NPRIOS 32 -#define LOWEST_PRIORITY (SL_FPRR_NPRIOS - 1) +#define LOWEST_PRIORITY (15) #define LOW_PRIORITY (LOWEST_PRIORITY - 1) #define HIGH_PRIORITY (LOWEST_PRIORITY - 10) @@ -137,14 +138,18 @@ run_xcore_tests() for (i = 0; i < XCORE_THDS; i++) { sched_param_t p[1]; + struct sl_xcore_thd *t = NULL; if (cpu == cos_cpuid()) cpu++; cpu %= NUM_CPU; xcore_thd_data[cos_cpuid()][i] = (cpu << 16) | i; p[0] = sched_param_pack(SCHEDP_PRIO, HIGH_PRIORITY); - ret = sl_xcore_thd_alloc(cpu, test_xcore_fn, (void *)&xcore_thd_data[cos_cpuid()][i], 1, p); - if (ret) break; + t = sl_xcore_thd_alloc(cpu, test_xcore_fn, (void *)&xcore_thd_data[cos_cpuid()][i], 1, p); + if (!t) { + ret = -1; + break; + } cpu++; } @@ -156,10 +161,10 @@ run_xcore_tests() static void run_tests() { - test_highest_is_scheduled(); - PRINTC("%s: Schedule highest priority thread only!\n", high_thd_test_status[cos_cpuid()] ? "FAILURE" : "SUCCESS"); - test_swapping(); - PRINTC("%s: Swap back and forth!\n", (thd1_ran[cos_cpuid()] && thd2_ran[cos_cpuid()]) ? "SUCCESS" : "FAILURE"); +// test_highest_is_scheduled(); +// PRINTC("%s: Schedule highest priority thread only!\n", high_thd_test_status[cos_cpuid()] ? "FAILURE" : "SUCCESS"); +// test_swapping(); +// PRINTC("%s: Swap back and forth!\n", (thd1_ran[cos_cpuid()] && thd2_ran[cos_cpuid()]) ? 
"SUCCESS" : "FAILURE"); run_xcore_tests(); @@ -170,22 +175,26 @@ run_tests() void cos_init(void) { - static unsigned long first = 1, init_done = 0; + int i; + static unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; struct sl_thd *testing_thread; struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); PRINTC("Unit-test for the scheduling library (sl)\n"); - if (ps_cas(&first, 1, 0)) { + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); - ps_faa(&init_done, 1); + cos_defcompinfo_llinit(); } else { - while (!ps_load(&init_done)) ; + while (!ps_load(&init_done[first])) ; cos_defcompinfo_sched_init(); } + ps_faa(&init_done[cos_cpuid()], 1); + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } sl_init(SL_MIN_PERIOD_US); testing_thread = sl_thd_alloc(run_tests, NULL); diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 5256a254cb..cc9ed36a21 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -426,7 +426,9 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { struct cos_scb_info *scb = sl_scb_info_core(); - assert(sl_thd_dcbinfo(curr) && sl_thd_dcbinfo(next)); + if (unlikely(!sl_thd_dcbinfo(curr) || !sl_thd_dcbinfo(next))) { + return sl_thd_kern_dispatch(sl_thd_thdcap(next)); + } /* * jump labels in the asm routine: * @@ -613,6 +615,8 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) #ifdef SL_CS sl_cs_exit(); #endif + if (t == sl__globals_core()->idle_thd) t = sl__globals_core()->sched_thd; + if (t == sl_thd_curr()) return 0; return sl_thd_dispatch(t, tok, sl_thd_curr()); // ret = sl_thd_activate(t, tok); diff --git a/src/components/include/sl_xcore.h b/src/components/include/sl_xcore.h index 40c5c54e19..8313e9a787 100644 --- a/src/components/include/sl_xcore.h +++ b/src/components/include/sl_xcore.h @@ -15,41 +15,60 @@ typedef enum { SL_XCORE_AEP_ALLOC_EXT, SL_XCORE_INITAEP_ALLOC, SL_XCORE_THD_DEALLOC, /* thread delete, need it? */ + + SL_XCORE_THD_PARAM_SET, + SL_XCORE_THD_WAKEUP, } sl_xcore_req_t; struct sl_xcore_request { sl_xcore_req_t type; /* request type */ - cpuid_t client; /* client cpu making the request */ - int req_response; /* client needs a response */ - sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ - int param_count; /* number of parameters */ + cpuid_t client_core; /* client cpu making the request */ + thdid_t client_thd; + vaddr_t response; /* response addr */ union { struct { cos_thd_fn_t fn; void *data; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ } sl_xcore_req_thd_alloc; struct { cos_thd_fn_t fn; void *data; int own_tcap; cos_channelkey_t key; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ } sl_xcore_req_aep_alloc; struct { thdclosure_index_t idx; /* TODO: create thread in another component ? 
*/ struct cos_defcompinfo *dci; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ } sl_xcore_req_thd_alloc_ext; struct { thdclosure_index_t idx; int own_tcap; cos_channelkey_t key; struct cos_defcompinfo *dci; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ } sl_xcore_req_aep_alloc_ext; struct { int is_sched; int own_tcap; struct cos_defcompinfo *dci, *sched; + sched_param_t params[SL_XCORE_PARAM_MAX]; /* scheduling parameters */ + int param_count; /* number of parameters */ } sl_xcore_req_initaep_alloc; + struct { + thdid_t tid; + sched_param_t param; + } sl_xcore_req_thd_param_set; + struct { + thdid_t tid; + } sl_xcore_req_thd_wakeup; }; }; @@ -57,6 +76,34 @@ CK_RING_PROTOTYPE(xcore, sl_xcore_request); #define SL_XCORE_RING_SIZE (64 * sizeof(struct sl_xcore_request)) /* in sl_const.h? */ +/* + * TODO: unionize with sl_thd? + * + * IMHO, no! This will occupy too much memory if unionized! + * Plus, that would require that we'd need cpuid in the sl_thd and many + * branches around in the code for core-local scheduling! + * Also, making this struct explicit, makes API use explicit. + * I should only be able to use: param_set(), wakeup() and perhaps free(). + */ +struct sl_xcore_thd { + thdid_t thd; + cpuid_t core; + + asndcap_t asnd; +}; + +struct sl_xcore_thd *sl_xcore_thd_lookup(thdid_t tid, cpuid_t core); +static inline thdid_t +sl_xcore_thd_thdid(struct sl_xcore_thd *t) +{ + return t->thd; +} +static inline cpuid_t +sl_xcore_thd_core(struct sl_xcore_thd *t) +{ + return t->core; +} + /* perhaps move these to sl.h? */ struct sl_global { struct ck_ring xcore_ring[NUM_CPU]; /* mpsc ring! */ @@ -99,11 +146,14 @@ sl__ring_buffer_curr(void) return sl__ring_buffer(cos_cpuid()); } -int sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched_param_t params[]); -int sl_xcore_thd_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int nparams, sched_param_t params[]); -int sl_xcore_aep_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); -int sl_xcore_aep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); -int sl_xcore_initaep_alloc(cpuid_t core, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); -int sl_xcore_initaep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_thd_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_aep_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_aep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_initaep_alloc(cpuid_t core, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +struct sl_xcore_thd *sl_xcore_initaep_alloc_ext(cpuid_t 
core, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]); +void sl_xcore_thd_param_set(struct sl_xcore_thd *t, sched_param_t param); +void sl_xcore_thd_wakeup(struct sl_xcore_thd *t); +void sl_xcore_thd_wakeup_tid(thdid_t tid, cpuid_t core); #endif /* SL_XCORE_H */ diff --git a/src/components/lib/cos_component.c b/src/components/lib/cos_component.c index e4df0404a7..91db2a0ffc 100644 --- a/src/components/lib/cos_component.c +++ b/src/components/lib/cos_component.c @@ -171,6 +171,14 @@ cos_upcall_fn(upcall_type_t t, void *arg1, void *arg2, void *arg3) constructors_execute(); } + /* + * if it's the first component.. wait for timer calibration + * NOTE: for "fork"ing components and not updating "spdid"s, this call will just fail and should be fine. + */ + if (cos_spd_id() == 0) { + cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + } + switch (t) { case COS_UPCALL_THD_CREATE: /* New thread creation method passes in this type. */ diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index 941b0001dc..95408324c7 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -103,7 +103,7 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) aep->thd = capmgr_thd_create(fn, data, &tid, &dcb); if (!aep->thd) goto done; aep->tid = tid; - assert(tid && dcb); + assert(tid); t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); @@ -227,7 +227,7 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c if (prps & SL_THD_PROPERTY_OWN_TCAP) owntc = 1; capmgr_aep_create(aep, fn, data, owntc, key, &dcb); if (aep->thd == 0) goto done; - assert(aep->tid && dcb); + assert(aep->tid); t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index 6b6741e62a..4af637aee7 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -93,7 +93,7 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; dcap = cos_dcb_info_alloc_curr(&doff, (vaddr_t *)&dcb); - assert(dcap); + if (dcb && doff) assert(dcap); aep->thd = cos_thd_alloc(ci, ci->comp_cap, fn, data, dcap, doff); if (!aep->thd) goto done; @@ -180,7 +180,7 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; dcap = cos_dcb_info_alloc_curr(&doff, (vaddr_t *)&dcb); - assert(dcap); + if (dcb && doff) assert(dcap); /* NOTE: Cannot use stack-allocated cos_aep_info struct here */ if (prps & SL_THD_PROPERTY_OWN_TCAP) ret = cos_aep_alloc(aep, fn, data, dcap, doff); diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index e7c1d6d107..05a55c7f1c 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -462,7 +462,7 @@ sl_thd_yield_cs_exit_intern(thdid_t tid) sl_cs_exit_switchto(to); } else { - sl_mod_yield(sl_mod_thd_policy_get(t), NULL); + if (likely(t != sl__globals_core()->sched_thd && t != sl__globals_core()->idle_thd)) sl_mod_yield(sl_mod_thd_policy_get(t), NULL); sl_cs_exit_schedule(); } } diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c index f7157520c3..0f4db433bc 100644 --- a/src/components/lib/sl/sl_xcore.c +++ b/src/components/lib/sl/sl_xcore.c @@ -4,74 +4,236 @@ #include #include -#define SL_REQ_THD_ALLOC(req, fn, 
data) do { \ - req.type = SL_XCORE_THD_ALLOC; \ - req.client = cos_cpuid(); \ - req.req_response = 0; \ - req.sl_xcore_req_thd_alloc.fn = fn; \ - req.sl_xcore_req_thd_alloc.data = data; \ - } while (0) +/******************************* Client-side ***************************/ + +/* static xcore thread backend! mainly for bookkeeping across cores! */ +struct _sl_xcore_thds { + struct sl_xcore_thd _thds[MAX_NUM_THREADS]; +} CACHE_ALIGNED; + +static struct _sl_xcore_thds _xcore_thds[NUM_CPU]; + +static inline struct sl_xcore_thd * +_sl_xcore_thd_backend_lookup(thdid_t tid) +{ + return &(_xcore_thds[cos_cpuid()]._thds[tid]); +} + +static inline struct sl_xcore_thd * +_sl_xcore_thd_backend_init(thdid_t tid, cpuid_t core, asndcap_t snd) +{ + struct sl_xcore_thd *t = _sl_xcore_thd_backend_lookup(tid); + + sl_cs_enter(); + if (unlikely(t->thd)) goto done; + t->thd = tid; + t->core = core; + t->asnd = snd; + +done: + sl_cs_exit(); + + return t; +} + +struct sl_xcore_thd * +sl_xcore_thd_lookup(thdid_t tid, cpuid_t core) +{ + struct sl_xcore_thd *t = _sl_xcore_thd_backend_lookup(tid); + + /* TODO: is this safe? a wrong coreid can cause DOS! */ + if (unlikely(!(t->thd))) return _sl_xcore_thd_backend_init(tid, core, 0); + /* something wrong! */ + if (unlikely(t->core != core)) return NULL; + + return t; +} + +#define SL_XCORE_REQ(req, typ, resp) do { \ + req.type = typ; \ + req.client_core = cos_cpuid(); \ + req.client_thd = cos_thdid(); \ + req.response = resp; \ + } while (0) extern struct sl_thd *sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data); -int +static inline int +_sl_xcore_request_enqueue_no_cs(cpuid_t core, struct sl_xcore_request *rq) +{ + int ret = 0; + + if (unlikely(core == cos_cpuid())) return -1; + if (unlikely(!bitmap_check(sl__globals()->core_bmp, core))) return -1; + ret = ck_ring_enqueue_mpsc_xcore(sl__ring(core), sl__ring_buffer(core), rq); + if (unlikely(ret == false)) return -1; + + return 0; +} + +static inline int +_sl_xcore_request_enqueue(cpuid_t core, struct sl_xcore_request *rq) +{ + int ret = 0; + /* asndcap_t snd = 0; */ + + sl_cs_enter(); + ret = _sl_xcore_request_enqueue_no_cs(core, rq); + sl_cs_exit(); + if (unlikely(ret)) return -1; + + /* snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; */ + /* assert(snd); */ + + /* send an IPI for the request */ + /* if (snd) cos_asnd(snd, 0); */ + + return 0; +} + +struct sl_xcore_thd * sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched_param_t params[]) { int ret = 0; asndcap_t snd = 0; struct sl_xcore_request req; + volatile thdid_t xcore_tid = 0; - if (core == cos_cpuid()) return -EINVAL; - if (!bitmap_check(sl__globals()->core_bmp, core)) return -EINVAL; + SL_XCORE_REQ(req, SL_XCORE_THD_ALLOC, (vaddr_t)&xcore_tid); + req.sl_xcore_req_thd_alloc.fn = fn; + req.sl_xcore_req_thd_alloc.data = data; + if (nparams) memcpy(req.sl_xcore_req_thd_alloc.params, params, sizeof(sched_param_t) * nparams); + req.sl_xcore_req_thd_alloc.param_count = nparams; - sl_cs_enter(); + ret = _sl_xcore_request_enqueue(core, &req); + if (unlikely(ret)) return NULL; - SL_REQ_THD_ALLOC(req, fn, data); - if (nparams) memcpy(req.params, params, sizeof(sched_param_t) * nparams); - req.param_count = nparams; - if (ck_ring_enqueue_mpsc_xcore(sl__ring(core), sl__ring_buffer(core), &req) != true) { - ret = -ENOMEM; + /* Other core will wake this up after creation! 
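Because the call now blocks the requesting thread until the remote core has created the thread and replied with its id, a client can treat a cross-core allocation much like a local one. A usage sketch follows; the worker function, target core, and priority values are invented for illustration, while the calls themselves mirror the unit_fprr test earlier in this patch:

	sched_param_t        p[1];
	struct sl_xcore_thd *t;

	p[0] = sched_param_pack(SCHEDP_PRIO, 10);             /* hypothetical priority */
	t    = sl_xcore_thd_alloc(1, worker_fn, NULL, 1, p);  /* create worker_fn on core 1, blocks until done */
	assert(t);
	sl_xcore_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, 12)); /* later: re-prioritize it remotely... */
	sl_xcore_thd_wakeup(t);                                /* ...or wake it from this core */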
*/ + if (sl_thd_curr() != sl__globals_core()->sched_thd) { + sl_thd_block(0); } else { - snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; - assert(snd); + while (!xcore_tid) sl_thd_yield(0); } - - sl_cs_exit(); - /* if (!snd) return -1; */ - /* send an IPI for the request */ - /* cos_asnd(snd, 0); */ - - return ret; + assert(xcore_tid); + + return _sl_xcore_thd_backend_init(xcore_tid, core, 0); } -int +struct sl_xcore_thd * sl_xcore_thd_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int nparams, sched_param_t params[]) { - return -ENOTSUP; + return NULL; } -int +struct sl_xcore_thd * sl_xcore_aep_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) { - return -ENOTSUP; + return NULL; } -int +struct sl_xcore_thd * sl_xcore_aep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, thdclosure_index_t idx, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) { - return -ENOTSUP; + return NULL; } -int +struct sl_xcore_thd * sl_xcore_initaep_alloc(cpuid_t core, struct cos_defcompinfo *dci, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) { - return -ENOTSUP; + return NULL; } -int +struct sl_xcore_thd * sl_xcore_initaep_alloc_ext(cpuid_t core, struct cos_defcompinfo *dci, struct cos_defcompinfo *sched, int own_tcap, cos_channelkey_t key, int nparams, sched_param_t params[]) { - return -ENOTSUP; + return NULL; +} + +void +sl_xcore_thd_param_set(struct sl_xcore_thd *t, sched_param_t param) +{ + struct sl_xcore_request req; + cpuid_t core = sl_xcore_thd_core(t); + + SL_XCORE_REQ(req, SL_XCORE_THD_PARAM_SET, 0); + req.sl_xcore_req_thd_param_set.tid = sl_xcore_thd_thdid(t); + req.sl_xcore_req_thd_param_set.param = param; + + _sl_xcore_request_enqueue(core, &req); +} + +static inline void +_sl_xcore_thd_wakeup_tid_no_cs(thdid_t tid, cpuid_t core) +{ + struct sl_xcore_request req; + + SL_XCORE_REQ(req, SL_XCORE_THD_WAKEUP, 0); + req.sl_xcore_req_thd_wakeup.tid = tid; + _sl_xcore_request_enqueue_no_cs(core, &req); +} + +void +sl_xcore_thd_wakeup(struct sl_xcore_thd *t) +{ + struct sl_xcore_request req; + cpuid_t core = sl_xcore_thd_core(t); + + if (unlikely(!t)) return; + + SL_XCORE_REQ(req, SL_XCORE_THD_WAKEUP, 0); + req.sl_xcore_req_thd_wakeup.tid = sl_xcore_thd_thdid(t); + _sl_xcore_request_enqueue(core, &req); +} + +void +sl_xcore_thd_wakeup_tid(thdid_t tid, cpuid_t core) +{ + struct sl_xcore_thd *t = sl_xcore_thd_lookup(tid, core); + + sl_xcore_thd_wakeup(t); +} + +/******************************* Server-side ***************************/ + +static inline int +_sl_xcore_req_thd_alloc_no_cs(struct sl_xcore_request *req) +{ + cos_thd_fn_t fn = req->sl_xcore_req_thd_alloc.fn; + void *data = req->sl_xcore_req_thd_alloc.data; + struct sl_thd *t; + int i; + + assert(fn); + + t = sl_thd_alloc_no_cs(fn, data); + assert(t); + if (likely(req->response)) *((thdid_t *)req->response) = sl_thd_thdid(t); + for (i = 0; i < req->sl_xcore_req_thd_alloc.param_count; i++) sl_thd_param_set(t, req->sl_xcore_req_thd_alloc.params[i]); + _sl_xcore_thd_wakeup_tid_no_cs(req->client_thd, req->client_core); + + return 0; +} + +static inline int +_sl_xcore_req_thd_param_set_no_cs(struct sl_xcore_request *req) +{ + struct sl_thd *t = sl_thd_lkup(req->sl_xcore_req_thd_param_set.tid); + + if (!t) return -1; + sl_thd_param_set(t, req->sl_xcore_req_thd_param_set.param); + + return 0; +} + +static inline int +_sl_xcore_req_thd_wakeup_no_cs(struct sl_xcore_request *req) +{ + struct sl_thd *t = 
sl_thd_lkup(req->sl_xcore_req_thd_param_set.tid); + + if (!t) return -1; + sl_thd_wakeup_no_cs(t); + + return 0; } int @@ -81,24 +243,12 @@ sl_xcore_process_no_cs(void) struct sl_xcore_request xcore_req; while (ck_ring_dequeue_mpsc_xcore(sl__ring_curr(), sl__ring_buffer_curr(), &xcore_req) == true) { + assert(xcore_req.client_core != cos_cpuid()); - assert(xcore_req.client != cos_cpuid()); switch(xcore_req.type) { case SL_XCORE_THD_ALLOC: { - cos_thd_fn_t fn = xcore_req.sl_xcore_req_thd_alloc.fn; - void *data = xcore_req.sl_xcore_req_thd_alloc.data; - struct sl_thd *t; - int i; - - assert(fn); - - t = sl_thd_alloc_no_cs(fn, data); - assert(t); - for (i = 0; i < xcore_req.param_count; i++) { - sl_thd_param_set(t, xcore_req.params[i]); - } - + _sl_xcore_req_thd_alloc_no_cs(&xcore_req); break; } case SL_XCORE_THD_ALLOC_EXT: @@ -106,10 +256,26 @@ sl_xcore_process_no_cs(void) case SL_XCORE_AEP_ALLOC_EXT: case SL_XCORE_INITAEP_ALLOC: case SL_XCORE_THD_DEALLOC: - default: { PRINTC("Unimplemented request! Aborting!\n"); assert(0); + + break; + } + case SL_XCORE_THD_PARAM_SET: + { + _sl_xcore_req_thd_param_set_no_cs(&xcore_req); + break; + } + case SL_XCORE_THD_WAKEUP: + { + _sl_xcore_req_thd_wakeup_no_cs(&xcore_req); + break; + } + default: + { + PRINTC("Unrecognized request! Aborting!\n"); + assert(0); } } num ++; From 4c83c035baa09ca668adbc149c71b40afd1fb14e Mon Sep 17 00:00:00 2001 From: phani Date: Tue, 23 Apr 2019 12:01:42 -0400 Subject: [PATCH 049/127] Platform fixes beyond smp branch --- src/platform/i386/kernel.c | 3 +++ src/platform/i386/lapic.h | 19 ++++++++++--------- src/platform/i386/serial.c | 7 +++++++ src/platform/i386/serial.h | 1 + 4 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/platform/i386/kernel.c b/src/platform/i386/kernel.c index 904ff722c5..a91c6f5437 100644 --- a/src/platform/i386/kernel.c +++ b/src/platform/i386/kernel.c @@ -166,6 +166,9 @@ kmain(struct multiboot *mboot, u32_t mboot_magic, u32_t esp) chal_irq_enable(HW_SERIAL, 0); pic_init(); ioapic_init(); +#ifdef ENABLE_SERIAL + serial_late_init(); +#endif smp_init(cores_ready); cores_ready[INIT_CORE] = 1; diff --git a/src/platform/i386/lapic.h b/src/platform/i386/lapic.h index bfc80b8163..6156ffc708 100644 --- a/src/platform/i386/lapic.h +++ b/src/platform/i386/lapic.h @@ -3,16 +3,17 @@ #include "apic_cntl.h" -void lapic_ack(void); -void lapic_iter(struct lapic_cntl *); -u32_t lapic_find_localaddr(void *l); -void lapic_set_page(u32_t page); -void lapic_timer_init(void); -void lapic_set_timer(int timer_type, cycles_t deadline); -u32_t lapic_get_ccr(void); -void lapic_timer_calibration(u32_t ratio); +void lapic_ack(void); +void lapic_iter(struct lapic_cntl *); +u32_t lapic_find_localaddr(void *l); +void lapic_set_page(u32_t page); +void lapic_timer_init(void); +void lapic_set_timer(int timer_type, cycles_t deadline); +u32_t lapic_get_ccr(void); +void lapic_timer_calibration(u32_t ratio); +void lapic_asnd_ipi_send(const cpuid_t cpu_id); -extern u32_t lapic_timer_calib_init; +extern volatile u32_t lapic_timer_calib_init; extern int apicids[NUM_CPU]; extern u32_t logical_apicids[NUM_CPU]; diff --git a/src/platform/i386/serial.c b/src/platform/i386/serial.c index 8f4f5adf91..5685938af6 100644 --- a/src/platform/i386/serial.c +++ b/src/platform/i386/serial.c @@ -94,3 +94,10 @@ serial_init(void) outb(SERIAL_PORT_A + 1, 0x01); /* Enable interrupts on receive */ printk("Enabling serial I/O\n"); } + +void +serial_late_init(void) +{ + chal_irq_enable(HW_SERIAL, 0); + chal_irq_enable(HW_KEYBOARD, 0); +} diff 
--git a/src/platform/i386/serial.h b/src/platform/i386/serial.h index bc8461644d..777c31078e 100644 --- a/src/platform/i386/serial.h +++ b/src/platform/i386/serial.h @@ -3,6 +3,7 @@ #ifdef ENABLE_SERIAL void serial_init(void); +void serial_late_init(void); #endif #endif From 39e86e27ea6780741bb372a444bb968669d434b2 Mon Sep 17 00:00:00 2001 From: phani Date: Tue, 23 Apr 2019 12:02:19 -0400 Subject: [PATCH 050/127] 32bit compilation on a 64bit system * NOTE: include changes in ps library also! --- src/components/Makefile.comp | 8 ++++++++ src/platform/linker/Makefile | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index ed593faaec..2a0bd21f57 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -48,6 +48,14 @@ LDFLAGS=-melf_i386 MUSLCFLAGS=$(CFLAGS) -lc -lgcc -Xlinker -r ASFLAGS=-m32 $(INC_PATH) $(SHARED_FLAGS) +GCC_PIE=$(shell gcc -v 2>&1 | grep -c "\--enable-default-pie") +ifeq ($(GCC_PIE),1) +MUSLCFLAGS+=-no-pie +LDFLAGS+=-no-pie +CFLAGS+=-fno-pie +CXXFLAGS+=-fno-pie +endif + SERVER_STUB=s_stub.o CLIENT_STUB=c_stub.o diff --git a/src/platform/linker/Makefile b/src/platform/linker/Makefile index 98394046d6..bafdde4485 100644 --- a/src/platform/linker/Makefile +++ b/src/platform/linker/Makefile @@ -2,7 +2,7 @@ include Makefile.src CC=gcc LD=ld -CFLAGS=-D__x86__ -D_GNU_SOURCE -lpthread -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast -Wno-format -ggdb3 -I$(SHAREDINC) +CFLAGS=-m32 -D__x86__ -D_GNU_SOURCE -lpthread -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast -Wno-format -ggdb3 -I$(SHAREDINC) LDFLAGS=-melf_i386 PRODUCTS=cos_linker gen_client_stub From 579d364b438c20bdff32d04be8f1ab3a47377d2d Mon Sep 17 00:00:00 2001 From: phani Date: Tue, 23 Apr 2019 18:05:59 -0400 Subject: [PATCH 051/127] Bugfixes and cleanup after a massive merge --- src/components/implementation/capmgr/naive/init.c | 2 +- .../no_interface/llbooter/boot_deps.h | 15 --------------- .../implementation/tests/micro_booter/test_ipi.c | 2 ++ src/components/include/hypercall.h | 12 ------------ src/components/lib/sl/sl_sched.c | 6 +++--- src/components/lib/sl/sl_xcore.c | 2 ++ src/kernel/include/pgtbl.h | 1 - src/kernel/include/shared/cos_config.h | 2 +- 8 files changed, 9 insertions(+), 33 deletions(-) diff --git a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index 1601463b01..fc8087c9c9 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -51,7 +51,7 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) assert(rci_sched && cap_info_init_check(rci_sched)); rci_cpu->parent = rci_sched; rci_cpu->thd_used = 1; - cap_info_cpu_initdcb_init(rci); + if (cos_cpuid() != INIT_CORE) cap_info_cpu_initdcb_init(rci); while ((remain_child = hypercall_comp_child_next(spdid, &childid, &ch_flags)) >= 0) { bitmap_set(rci_cpu->child_bitmap, childid - 1); diff --git a/src/components/implementation/no_interface/llbooter/boot_deps.h b/src/components/implementation/no_interface/llbooter/boot_deps.h index 8395121f6c..f6a961dc0e 100644 --- a/src/components/implementation/no_interface/llbooter/boot_deps.h +++ b/src/components/implementation/no_interface/llbooter/boot_deps.h @@ -255,7 +255,6 @@ boot_newcomp_defcinfo_init(spdid_t spdid) dcbcap_t dcbcap = 0; dcboff_t dcboff = 0; - dcbcap = 
cos_dcb_alloc(boot_info, child_ci->pgtbl_cap, spdinfo->initdcbpgs + cos_cpuid() * PAGE_SIZE); assert(dcbcap); @@ -842,20 +841,6 @@ hypercall_entry(word_t *ret2, word_t *ret3, int op, word_t arg3, word_t arg4, wo break; } - case HYPERCALL_COMP_CAPFRONTIER_GET: - { - vaddr_t vasfr; - capid_t capfr; - spdid_t srcid = arg3; - - if (!__hypercall_resource_access_check(client, srcid, 1)) return -EACCES; - ret1 = boot_comp_frontier_get(client, srcid, &vasfr, &capfr); - if (ret1) goto done; - - *ret2 = vasfr; - - break; - } case HYPERCALL_COMP_CPUBITMAP_GET: { spdid_t srcid = arg3; diff --git a/src/components/implementation/tests/micro_booter/test_ipi.c b/src/components/implementation/tests/micro_booter/test_ipi.c index e032d7c0a8..282b12232a 100644 --- a/src/components/implementation/tests/micro_booter/test_ipi.c +++ b/src/components/implementation/tests/micro_booter/test_ipi.c @@ -337,6 +337,8 @@ test_ipi_full(void) asndcap_t s = 0; thdcap_t t = 0; + if (NUM_CPU < 2) return; + if (cos_cpuid() == TEST_RCV_CORE) { t = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_rcv_fn, NULL, 0, 0); assert(t); diff --git a/src/components/include/hypercall.h b/src/components/include/hypercall.h index a8ba773c51..aa545ff77f 100644 --- a/src/components/include/hypercall.h +++ b/src/components/include/hypercall.h @@ -13,7 +13,6 @@ enum hypercall_cntl { HYPERCALL_COMP_COMPCAP_GET, HYPERCALL_COMP_CAPTBLCAP_GET, HYPERCALL_COMP_PGTBLCAP_GET, - HYPERCALL_COMP_CAPFRONTIER_GET, HYPERCALL_COMP_INITAEP_GET, HYPERCALL_COMP_CHILD_NEXT, @@ -191,17 +190,6 @@ hypercall_comp_pgtblcap_get(spdid_t spdid) return ptslot; } -static inline capid_t -hypercall_comp_capfrontier_get(spdid_t spdid) -{ - word_t unused; - capid_t cap_frontier; - - if (cos_sinv_rets(BOOT_CAPTBL_SINV_CAP, HYPERCALL_COMP_CAPFRONTIER_GET, spdid, 0, 0, &cap_frontier, &unused)) return 0; - - return cap_frontier; -} - static inline int hypercall_comp_cpubitmap_get(spdid_t spdid, u32_t *bmp) { diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 69020f4bff..16def138f2 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -576,6 +576,7 @@ sl_global_init(u32_t *core_bmp) memset(g, 0, sizeof(struct sl_global)); assert(sizeof(struct cos_scb_info) * NUM_CPU <= COS_SCB_SIZE && COS_SCB_SIZE == PAGE_SIZE); + g->scb_area = (struct cos_scb_info *)cos_scb_info_get(); for (i = 0; i < NUM_CPU; i++) { if (!bitmap_check(core_bmp, i)) continue; @@ -583,20 +584,19 @@ sl_global_init(u32_t *core_bmp) bitmap_set(g->core_bmp, i); ck_ring_init(sl__ring(i), SL_XCORE_RING_SIZE); } - g->scb_area = (struct cos_scb_info *)cos_scb_info_get(); } void sl_init_corebmp(microsec_t period, u32_t *corebmp) { int i; - static volatile unsigned long first = 1, init_done = 0; + static volatile unsigned long first = NUM_CPU + 1, init_done = 0; struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(dci); struct sl_global_core *g = sl__globals_core(); struct cos_aep_info *ga = cos_sched_aep_get(dci); - if (ps_cas((unsigned long *)&first, 1, 0)) { + if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { sl_global_init(corebmp); ps_faa((unsigned long *)&init_done, 1); } else { diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c index 0f4db433bc..10e149d868 100644 --- a/src/components/lib/sl/sl_xcore.c +++ b/src/components/lib/sl/sl_xcore.c @@ -63,6 +63,7 @@ _sl_xcore_request_enqueue_no_cs(cpuid_t core, struct sl_xcore_request *rq) { int ret = 0; + if 
(unlikely(core >= NUM_CPU)) return -1; if (unlikely(core == cos_cpuid())) return -1; if (unlikely(!bitmap_check(sl__globals()->core_bmp, core))) return -1; ret = ck_ring_enqueue_mpsc_xcore(sl__ring(core), sl__ring_buffer(core), rq); @@ -77,6 +78,7 @@ _sl_xcore_request_enqueue(cpuid_t core, struct sl_xcore_request *rq) int ret = 0; /* asndcap_t snd = 0; */ + if (unlikely(core >= NUM_CPU)) return -1; sl_cs_enter(); ret = _sl_xcore_request_enqueue_no_cs(core, rq); sl_cs_exit(); diff --git a/src/kernel/include/pgtbl.h b/src/kernel/include/pgtbl.h index bef911d995..f07c4b4ad5 100644 --- a/src/kernel/include/pgtbl.h +++ b/src/kernel/include/pgtbl.h @@ -290,7 +290,6 @@ pgtbl_mapping_add(pgtbl_t pt, u32_t addr, u32_t page, u32_t flags) PGTBL_DEPTH, &accum); if (!pte) return -ENOENT; orig_v = (u32_t)(pte->next); -// printk("%p %x\n", pte, orig_v); if (orig_v & PGTBL_PRESENT) return -EEXIST; if (orig_v & PGTBL_COSFRAME) return -EPERM; diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index 8c46ae5377..bf501b3be9 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -17,7 +17,7 @@ #include "cpu_ghz.h" -#define NUM_CPU 2 +#define NUM_CPU 1 #define NUM_CPU_BMP_BYTES ((NUM_CPU + 7) / 8) #define NUM_CPU_BMP_WORDS ((NUM_CPU_BMP_BYTES + 3) / 4) From 0c75f1d6767bdc55ddff63207c460fe320ad2858 Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 29 Apr 2019 12:13:25 -0400 Subject: [PATCH 052/127] ICV api in cos_omp and move cos_gomp to directory --- src/components/include/cos_omp.h | 31 +++++++++ src/components/lib/Makefile | 30 +++++--- src/components/lib/cos_gomp/Makefile | 20 ++++++ src/components/lib/{ => cos_gomp}/cos_gomp.c | 0 src/components/lib/cos_omp.c | 73 ++++++++++++++++++++ 5 files changed, 144 insertions(+), 10 deletions(-) create mode 100644 src/components/lib/cos_gomp/Makefile rename src/components/lib/{ => cos_gomp}/cos_gomp.c (100%) diff --git a/src/components/include/cos_omp.h b/src/components/include/cos_omp.h index 28cd98b035..61f65b7d1c 100644 --- a/src/components/include/cos_omp.h +++ b/src/components/include/cos_omp.h @@ -13,6 +13,37 @@ #define COS_OMP_MAX_NUM_THREADS (NUM_CPU) +struct cos_icv_data_env { + unsigned dyn_var; + unsigned nest_var; + unsigned nthreads_var; + unsigned run_sched_var; + unsigned bind_var; + unsigned thread_limit_var; + unsigned active_levels_var; + unsigned levels_var; + unsigned default_device_var; +}; + +struct cos_icv_global_env { + unsigned cancel_var; + unsigned max_task_priority_var; +}; + +struct cos_icv_implicittask_env { + unsigned place_partition_var; +}; + +struct cos_icv_device_env { + unsigned def_sched_var; + unsigned stacksize_var; + unsigned wait_policy_var; + unsigned max_active_levels_var; +}; + +extern void cos_omp_icv_data_init(struct cos_icv_data_env *icvde); +extern void cos_omp_icv_implitsk_init(struct cos_icv_implicittask_env *icvite); +extern void cos_omp_icv_device_init(struct cos_icv_device_env *icvdve, unsigned dev_no); extern void cos_omp_init(void); #endif /* COS_OMP_H */ diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index 304313fd10..7bec69a198 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o cos_omp.o cos_gomp.o +LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o cos_omp.o LIBS=$(LIB_OBJS:%.o=%.a) MANDITORY=c_stub.o cos_asm_upcall.o 
cos_asm_ainv.o cos_component.o MAND=$(MANDITORY_LIB) @@ -10,8 +10,8 @@ SIMPLE_STKLIB=simple_stklib.o CINC_ENV=$(CINC) export CINC_ENV -.PHONY: all sl ps ck sinv -all: $(LIBS) $(MAND) $(SIMPLE_STKLIB) sl sinv +.PHONY: all sl ps ck sinv cos_gomp posix cxx +all: $(LIBS) $(MAND) $(SIMPLE_STKLIB) sl posix sinv cos_gomp # we have to compile these without dietlibc so that there are not # symbol conflicts and this is why we have the %.a here and don't @@ -30,6 +30,9 @@ $(SIMPLE_STKLIB): $(SIMPLE_STACKS) sl: make $(MAKEFLAGS) -C sl +cos_gomp: + make $(MAKEFLAGS) -C cos_gomp + sinv: make $(MAKEFLAGS) -C sinv_async @@ -42,17 +45,21 @@ sinv: @$(CC) $(CFLAGS) $(CINC) -o $@ -c $^ clean: - $(info | [RM] Cleaning up directory) + $(info | [RM] Cleaning up libraries and directories) @rm -f a.out *.o *.a *.d *~ - make -C sl clean + @make -C sl clean + @make -C sinv_async clean + @make -C posix clean + @make -C cos_gomp clean distclean: + $(info | [RM] Uninstalling external libraries) make -C musl-1.1.11 distclean # keep the following commands in one line. make executes each line # with a new shell. - make -C posix clean - make -C libcxx clean make -C ck uninstall + make -C ps clean + make -C libcxx clean musl: cd musl-1.1.11; ./configure "CFLAGS=-m32 -O3" "LDFLAGS=-Wl,-melf_i386" --disable-shared --target=i386; cd .. @@ -62,12 +69,15 @@ musl: ps: cd ps; ./configure cos x86; cd ..; make -C ps config ; make -C ps all - ck: make -C ck all make -C ck install -init: clean distclean musl ck ps all -# keep the following commands in one line. Same as above. +posix: make -C posix + +cxx: make -C libcxx + +init: clean distclean musl ck ps cxx all +# keep the following commands in one line. Same as above. diff --git a/src/components/lib/cos_gomp/Makefile b/src/components/lib/cos_gomp/Makefile new file mode 100644 index 0000000000..536e0ec430 --- /dev/null +++ b/src/components/lib/cos_gomp/Makefile @@ -0,0 +1,20 @@ +include Makefile.src Makefile.comp + +OBJS=cos_gomp.o +LIB=cos_gomp +CINC+=-m32 + +.PHONY: all clean +all: $(LIB) + @cp *.a ../ + +%.o:%.c + $(info | [CC] Compiling C file $^ into $@) + @$(CC) $(CFLAGS) $(CINC) -o $@ -c $< + +$(LIB): $(OBJS) + $(info | [LD] Creating library file lib$(LIB).a) + @$(AR) cr lib$(LIB).a $^ + +clean: + @rm -f *.o *.a *.d diff --git a/src/components/lib/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c similarity index 100% rename from src/components/lib/cos_gomp.c rename to src/components/lib/cos_gomp/cos_gomp.c diff --git a/src/components/lib/cos_omp.c b/src/components/lib/cos_omp.c index 54ef11c92d..f1c7bea1bb 100644 --- a/src/components/lib/cos_omp.c +++ b/src/components/lib/cos_omp.c @@ -9,6 +9,13 @@ #include #include +#define COS_OMP_NUM_DEVS 1 + +static struct cos_icv_global_env cos_icv_glbenv; +static struct cos_icv_device_env cos_icv_devenv[COS_OMP_NUM_DEVS]; +static struct cos_icv_data_env cos_icv_init_dataenv; +static struct cos_icv_implicittask_env cos_icv_init_implitskenv; +static unsigned int _cos_omp_init_done = 0; static unsigned int _cycs_per_usec = 0; #define _USEC_TO_SEC_d(x) (((double)x)/(double)(1000*1000)) @@ -57,10 +64,76 @@ omp_get_thread_num(void) return (cos_thdid() % omp_get_max_threads()); } +static inline void +cos_omp_icv_global_init(void) +{ + assert(!_cos_omp_init_done); + /* TODO: what is not int? what is not zero? */ + /* cos_icv_glbenv.xxxx = yyyy; */ +} + +void +cos_omp_icv_data_init(struct cos_icv_data_env *icvde) +{ + if (unlikely(icvde == &cos_icv_init_dataenv)) { + assert(!_cos_omp_init_done); /* init only on startup! 
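The branch above runs only once, to populate the startup template; every later call copies that template into caller-provided storage. A sketch of the intended consumer pattern (the per-task variable is hypothetical; only the cos_omp calls come from this patch):

	struct cos_icv_data_env task_icvs;   /* e.g. per-task or per-parallel-region ICV state */

	cos_omp_init();                      /* once at startup: builds the initial ICV templates */
	cos_omp_icv_data_init(&task_icvs);   /* afterwards: stamp out a copy of the initial data environment */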
*/ + + /* TODO: what is not int? what is not zero! */ + return; + } + + assert(_cos_omp_init_done); + memcpy(icvde, &cos_icv_init_dataenv, sizeof(struct cos_icv_data_env)); +} + +void +cos_omp_icv_implitsk_init(struct cos_icv_implicittask_env *icvite) +{ + if (unlikely(icvite == &cos_icv_init_implitskenv)) { + assert(!_cos_omp_init_done); /* init only on startup! */ + + /* TODO: what is not int? what is not zero! */ + return; + } + + assert(_cos_omp_init_done); + memcpy(icvite, &cos_icv_init_implitskenv, sizeof(struct cos_icv_implicittask_env)); +} + +void +cos_omp_icv_device_init(struct cos_icv_device_env *icvdve, unsigned dev_no) +{ + assert(dev_no < COS_OMP_NUM_DEVS); + + if (unlikely(icvdve == &cos_icv_devenv[dev_no])) { + assert(!_cos_omp_init_done); /* init only on startup! */ + + /* TODO: what is not int? what is not zero! */ + return; + } + + assert(_cos_omp_init_done); + memcpy(icvdve, &cos_icv_devenv[dev_no], sizeof(struct cos_icv_device_env)); +} + +static inline void +cos_omp_icv_init(void) +{ + cos_omp_icv_global_init(); + + cos_omp_icv_device_init(&cos_icv_devenv[0], 0); + + cos_omp_icv_data_init(&cos_icv_init_dataenv); + cos_omp_icv_implitsk_init(&cos_icv_init_implitskenv); +} + void cos_omp_init(void) { _cycs_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); assert(_cycs_per_usec); + + cos_omp_icv_init(); + _cos_omp_init_done = 1; } From 4022a732154e33ca9e0ffd12044a1802d69c06bf Mon Sep 17 00:00:00 2001 From: Gabe Parmer Date: Sun, 5 May 2019 22:21:38 -0400 Subject: [PATCH 053/127] Initial Composite RunTime (crt) support for block points and mutexes - blockpoints added, along with a simple lock on top - added blkpts to sl to support the blkpt library - added a stacklist for stack-allocated block-tracking structures --- .../implementation/tests/crt_tests/Makefile | 8 + .../implementation/tests/crt_tests/crttests.c | 88 ++++++ src/components/include/crt_blkpt.h | 296 ++++++++++++++++++ src/components/include/crt_lock.h | 59 ++++ src/components/include/sl.h | 8 + src/components/include/stacklist.h | 50 +++ src/components/lib/sl/Makefile | 2 +- src/components/lib/sl/sl_blkpt.c | 124 ++++++++ src/platform/i386/runscripts/crttests.sh | 4 + 9 files changed, 638 insertions(+), 1 deletion(-) create mode 100644 src/components/implementation/tests/crt_tests/Makefile create mode 100644 src/components/implementation/tests/crt_tests/crttests.c create mode 100644 src/components/include/crt_blkpt.h create mode 100644 src/components/include/crt_lock.h create mode 100644 src/components/include/stacklist.h create mode 100644 src/components/lib/sl/sl_blkpt.c create mode 100644 src/platform/i386/runscripts/crttests.sh diff --git a/src/components/implementation/tests/crt_tests/Makefile b/src/components/implementation/tests/crt_tests/Makefile new file mode 100644 index 0000000000..1469929f49 --- /dev/null +++ b/src/components/implementation/tests/crt_tests/Makefile @@ -0,0 +1,8 @@ +COMPONENT=crtt.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/crt_tests/crttests.c b/src/components/implementation/tests/crt_tests/crttests.c new file mode 100644 index 0000000000..4e0c254e42 --- /dev/null +++ b/src/components/implementation/tests/crt_tests/crttests.c @@ -0,0 +1,88 @@ +/* + * Copyright 2016, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. 
+ */ + +#include +#include +#include +#include + +#include + +#define LOCK_ITER 10 +#define NTHDS 4 +struct crt_lock lock; +struct sl_thd *lock_thds[NTHDS]; +struct cos_compinfo *ci; + +unsigned int +next_off(unsigned int off) +{ + return cos_thdid() * 7 + 3; +} + +void +lock_thd(void *d) +{ + int i; + unsigned int off = cos_thdid(); + + sl_thd_yield(sl_thd_thdid(lock_thds[1])); + + for (i = 0; i < LOCK_ITER; i++) { + off = next_off(off); + + printc("Thread %d: attempt take\n", cos_thdid()); + crt_lock_take(&lock); + printc("switchto %d -> %d\n", cos_thdid(), sl_thd_thdid(lock_thds[off % NTHDS])); + sl_thd_yield(sl_thd_thdid(lock_thds[off % NTHDS])); + crt_lock_release(&lock); + off = next_off(off); + printc("switchto %d -> %d\n", cos_thdid(), sl_thd_thdid(lock_thds[off % NTHDS])); + sl_thd_yield(sl_thd_thdid(lock_thds[off % NTHDS])); + } +} + +void +test_lock(void) +{ + int i; + union sched_param_union sps[] = { + {.c = {.type = SCHEDP_PRIO, .value = 5}}, + {.c = {.type = SCHEDP_PRIO, .value = 6}}, + {.c = {.type = SCHEDP_PRIO, .value = 6}}, + {.c = {.type = SCHEDP_PRIO, .value = 7}} + }; + + crt_lock_init(&lock); + + printc("Create threads:\n"); + for (i = 0; i < NTHDS; i++) { + lock_thds[i] = sl_thd_alloc(lock_thd, NULL); + printc("\tcreating thread %d at prio %d\n", sl_thd_thdid(lock_thds[i]), sps[i].c.value); + sl_thd_param_set(lock_thds[i], sps[i].v); + } +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + ci = cos_compinfo_get(defci); + + printc("Unit-test for the crt (sl)\n"); + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + sl_init(SL_MIN_PERIOD_US); + + test_lock(); + + printc("Running benchmark...\n"); + sl_sched_loop_nonblock(); + + assert(0); + + return; +} diff --git a/src/components/include/crt_blkpt.h b/src/components/include/crt_blkpt.h new file mode 100644 index 0000000000..add6c19fe9 --- /dev/null +++ b/src/components/include/crt_blkpt.h @@ -0,0 +1,296 @@ +#ifndef CRT_BLKPT_H +#define CRT_BLKPT_H + +#include +#include +#include + +/*** + * The event count/block point is an abstraction to synchronize the + * blocking behavior of different threads on abstract events. The + * events are usually tied to a specific state of another + * data-structure (into which the blkpt is embedded). For example, a + * lock is taken and released thus generating an event for any + * blocking threads, or a ring buffer has a data item inserted into + * it, thus generating an event for any threads waiting for + * data. Concretely, we want a number of threads to be able to block, + * and a thread to be able to wake up one, or all of them. The + * challenge is solving a single race-condition: + * + * thd 0: check data-structure, determine the need for blocking and + * waiting for an event + * thd 0: preemption, switching to thd 1 + * thd 1: check data-structure, determine that an event is generated + * thd 1: call the scheduler, and wake all blocked threads (not + * including thd 0 yet) + * thd 1: preempt, and switch to thd 0 + * thd 0: call scheduler to block + * + * The resulting state is that thd 1 should have unblocked thd 0, but + * due to a race, the thd 0 will be blocked awaiting the *next* event + * that may never come. Event counts are meant to solve this + * problem. Traditional systems solve this problem using condition + * variables and a lock around the scheduling logic, but if you want + * to decouple the data-structure from the scheduler (e.g. 
as they are + * in different modes, or components), this is a fundamental problem. + * + * The event count abstraction: + * + * Assume the data-structure generating events has at least three + * states: + * S0: available + * S1: unavailable + * S2: unavailable & subscribed + * + * The transitions within the data-structure are: + * {S0->S1, S1->S0, S1->S2, S2->S0} + * + * Every transition into S0 is an abstract *event*. Threads that look + * at the state of the data-structure, and must block waiting for its + * state to change, wait for such an event to wakeup. + * + * The data-structure must define its own mapping to this state + * machine. A few examples: + * + * Mutexes: + * S0: Not locked. + * S1: Locked and held by thread 0. + * S2: Locked and held by thread 0, and threads 1...N contend the lock + * + * Ring buffer (for simplicity, assuming it never fills): + * S0: data items in ring buffer + * S1: no data in ring buffer + * S2: no data in ring buffer, and thread(s) are waiting for data + * + * The event counts are used to track the threads that use the + * data-structure when transitioning from S1->S2 (block thread), when + * it is in S2 (block additional threads), and when it transitions + * from S2->S0 (wakeup blocked threads). + * + * The event count is used in the following way: + * + * S0->S1: + * data-structure (DS) operation + * E.g. not locked -> locked, or + * dequeue from ring with single data item + * + * S1->S0: + * blkpt_checkpoint(ec) (not used) + * data-structure (DS) operation + * assert(blkpt_has_blocked(ec) == false) (as we're in S1) + * blkpt_trigger(ec) (won't do much as noone is blocked) + * E.g. unlock with no contention, or + * enqueue with no dequeuing threads + * + * S1->S2: + * cp = blkpt_checkpoint(ec) + * data-structure (DS) operation, determine we need to await event + * blkpt_wait(ec, cp) + * retry (this is why event counts can be used with lock-free data-structs) + * E.g. locked -> contended + * empty ring -> waiting for data + * + * S2->S0: + * data-structure (DS) operation + * assert(blkpt_has_blocked(ec) == true) (as we're in S2) + * blkpt_trigger(ec) (wake blocked threads!) + * E.g. unlock with contention, or + * enqueue with dequeuing threads + * + * Event count *optimization*: + * + * We prevent the race above using an epoch (count) for the events + * thus the name. However, to avoid rapid wraparound on the epoch, we + * only increment the epoch when the race condition is possible. That + * is to say, we only increment the event count when the + * data-structure has blocked threads. This not only delays + * wraparound, it also will avoid an atomic instruction for all + * operations that don't involve blocked threads (a common-case, + * exemplified by futexes, for example). + * + * Usage optimization: + * + * Because of the event counter optimization to only use expensive + * operations when triggering there are blocked threads, the user of + * this API can trigger whenever transitioning back to S0. + */ + +struct crt_blkpt { + sched_blkpt_id_t id; + /* most significant bit specifies blocked thds */ + sched_blkpt_epoch_t epoch_blocked; +}; + +struct crt_blkpt_checkpoint { + sched_blkpt_epoch_t epoch_blocked; +}; + +typedef enum { + CRT_BLKPT_UNIPROC = 1, /* are the event operations only called on a single core? */ + CRT_BLKPT_CRIT_SECT = 2, /* is only one thread ever going to trigger at a time? 
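To make the S0/S1/S2 pattern concrete, here is a small sketch, not part of the patch and with an invented mailbox type, of a single-producer/single-consumer "data ready" slot built on the checkpoint/wait/trigger calls defined just below in this header:

	struct mailbox {
		unsigned long    full;   /* 1: item available (S0); 0: empty (S1/S2) */
		unsigned long    item;
		struct crt_blkpt blkpt;
	};

	static unsigned long
	mailbox_recv(struct mailbox *m)
	{
		struct crt_blkpt_checkpoint chkpt;
		unsigned long               item;

		while (1) {
			crt_blkpt_checkpoint(&m->blkpt, &chkpt);
			if (ps_load(&m->full)) {   /* S0: consume the item */
				item    = m->item;
				m->full = 0;       /* S0 -> S1 */
				return item;
			}
			/* S1 -> S2: block until the next trigger, then re-check the structure */
			crt_blkpt_wait(&m->blkpt, 0, &chkpt);
		}
	}

	static void
	mailbox_send(struct mailbox *m, unsigned long item)
	{
		m->item = item;
		m->full = 1;                     /* transition back to S0: an "event" */
		crt_blkpt_trigger(&m->blkpt, 0); /* wake a waiting consumer, if any */
	}

The checkpoint is taken before the state check, so a trigger that races with the empty check advances the epoch and makes crt_blkpt_wait() return for a retry instead of blocking on a stale view.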
*/ +} crt_blkpt_flags_t; + +#define CRT_BLKPT_EPOCH_BLKED_BITS (sizeof(sched_blkpt_epoch_t) * 8) +#define CRT_BLKPT_BLKED_MASK ((1 << (CRT_BLKPT_EPOCH_BLKED_BITS - 2)) - 1) +#define CRT_BLKPT_BLKED(e) ((e) & CRT_BLKPT_BLKED_MASK) +#define CRT_BLKPT_EPOCH(e) ((e) & ~CRT_BLKPT_BLKED_MASK) + +/* Return != 0 on failure: no ids to allocate */ +static inline int +crt_blkpt_init(struct crt_blkpt *blkpt) +{ + sched_blkpt_id_t id; + + id = sched_blkpt_alloc(); + if (id == SCHED_BLKPT_NULL) return -1; + + *blkpt = (struct crt_blkpt){ + .id = id, + .epoch_blocked = 0 + }; + + return 0; +} + +static inline int +crt_blkpt_teardown(struct crt_blkpt *blkpt) +{ + return sched_blkpt_free(blkpt->id); +} + +/* Internal APIs that must be inlined to remove the branches */ +static inline int +__crt_blkpt_atomic_trigger(sched_blkpt_epoch_t *ec, sched_blkpt_epoch_t chkpt, crt_blkpt_flags_t flags) +{ + /* + * Assume that the most significant bit is the blocked + * indicator. This math might reset it to zero, which we want + * to do anyway (as part of CRT_BLKPT_EPOCH). + */ + sched_blkpt_epoch_t new = CRT_BLKPT_EPOCH(chkpt + 1); + + /* inlined so that constant propagation will get rid of condition */ + if (flags == CRT_BLKPT_UNIPROC) { + return ps_upcas(ec, chkpt, new); + } else { + return ps_cas(ec, chkpt, new); + } + /* TODO: faa for CRT_BLKPT_CRIT_SECT? */ +} + +/* + * If we return 1, then the caller will attempt to block, otherwise, + * return 0 and it will re-check the data-structure assuming that + * something happened in the mean time. + */ +static inline int +__crt_blkpt_atomic_wait(sched_blkpt_epoch_t *ec, sched_blkpt_epoch_t chkpt, crt_blkpt_flags_t flags) +{ + sched_blkpt_epoch_t cached = ps_load(ec); + sched_blkpt_epoch_t new = cached | CRT_BLKPT_BLKED_MASK; + int ret; + + /* + * We are the second or later blocker. Blocked already + * set. We're done here. + * + * It isn't clear if it is better to have the additional + * branch here for this to avoid atomic instructions, or to + * just always do the atomic instructions and possibly fail. + */ + if (cached == new) return 1; + + /* function is inlined so that constant propagation will get rid of condition */ + if (flags == CRT_BLKPT_UNIPROC) { + ret = ps_upcas(ec, chkpt, new); + } else { + ret = ps_cas(ec, chkpt, new); + } + if (unlikely(!ret)) { + /* + * CAS failure can mean that 1. another thread + * blocked, and set the blocked bit, or 2. an event is + * triggered. In the former case, we still want to + * block. In the latter case, we want to go back to + * the data-structure. + */ + return ps_load(ec) == new; /* same epoch with blocked set? == success */ + } + + return 1; +} + +/* Trigger an event, waking blocked threads. */ +static inline void +crt_blkpt_trigger(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags) +{ + /* + * Note that the flags should likely be passed in statically, + * as constants. That way they will be inlined the conditions + * in the *_atomic_* function will be removed. + */ + sched_blkpt_epoch_t saved = ps_load(&blkpt->epoch_blocked); + + /* The optimization: don't increment events if noone's listening */ + if (likely(!CRT_BLKPT_BLKED(saved))) return; + + /* slow(er) path for when we have blocked threads */ + if (!__crt_blkpt_atomic_trigger(&blkpt->epoch_blocked, saved, flags)) { + /* + * Race here between triggering threads. In this case, + * someone else already incremented the epoch and + * unblocked the threads. Yeah, helping algorithms! + */ + return; + } + /* + * Note that there is a race here. 
Multiple threads triggering + * events might pass different epochs down to the next + * level. This is OK as the next level always takes the epoch + * = max(epoch, ...) (for some wraparound-aware version of + * max). + */ + sched_blkpt_trigger(blkpt->id, CRT_BLKPT_EPOCH(saved), 0); +} + +/* Wake only a single, specified thread (tracked manually in the data-structure) */ +/* void crt_blkpt_trigger_one(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags, cos_thdid_t thdid); */ + +/* + * Checkpoint the state of the current event counter. This checkpoint + * is the one that is active during our operations on the + * data-structure. If we determine that we want to wait for an event + * (thus blocking), then the state of the checkpoint will be compared + * versus the state of the event counter to see if we're working off + * of outdated information. + */ +static inline void +crt_blkpt_checkpoint(struct crt_blkpt *blkpt, struct crt_blkpt_checkpoint *chkpt) +{ + chkpt->epoch_blocked = ps_load(&blkpt->epoch_blocked); +} + +/* Wait for an event. */ +static inline void +crt_blkpt_wait(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags, struct crt_blkpt_checkpoint *chkpt) +{ + /* + * If blocked is already set, we can try and block + * directly. Otherwise, go through and try to atomically set + * it. If that fails, then either epoch or blocked has been + * updated, so return and try accessing the data-structure + * again. + */ + if (!CRT_BLKPT_BLKED(chkpt->epoch_blocked) && + !__crt_blkpt_atomic_wait(&blkpt->epoch_blocked, chkpt->epoch_blocked, flags)) return; + + sched_blkpt_block(blkpt->id, CRT_BLKPT_EPOCH(chkpt->epoch_blocked), 0); +} + +/* + * Create an execution dependency on the specified thread for, + * e.g. priority inheritance. + */ +/* void crt_blkpt_wait_dep(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags, struct crt_blkpt_checkpoint *chkpt, cos_thdid_t thdid); */ + +#endif /* CRT_BLKPT_H */ diff --git a/src/components/include/crt_lock.h b/src/components/include/crt_lock.h new file mode 100644 index 0000000000..c393c53648 --- /dev/null +++ b/src/components/include/crt_lock.h @@ -0,0 +1,59 @@ +#ifndef CRT_LOCK_H +#define CRT_LOCK_H + +/*** + * Simple blocking lock. Uses blockpoints to enable the blocking and + * waking of contending threads. This has little to no intelligence, + * for example, not expressing dependencies for PI. + */ + +#include +#include + +struct crt_lock { + unsigned long owner; + struct crt_blkpt blkpt; +}; + +static inline int +crt_lock_init(struct crt_lock *l) +{ + l->owner = 0; + + return crt_blkpt_init(&l->blkpt); +} + +static inline int +crt_lock_teardown(struct crt_lock *l) +{ + assert(l->owner == 0); + + return crt_blkpt_teardown(&l->blkpt); +} + +static inline void +crt_lock_take(struct crt_lock *l) +{ + struct crt_blkpt_checkpoint chkpt; + + while (1) { + crt_blkpt_checkpoint(&l->blkpt, &chkpt); + + if (ps_cas(&l->owner, 0, (unsigned long)cos_thdid())) { + return; /* success! */ + } + /* failure: try and block */ + crt_blkpt_wait(&l->blkpt, 0, &chkpt); + } +} + +static inline void +crt_lock_release(struct crt_lock *l) +{ + assert(l->owner == cos_thdid()); + l->owner = 0; + /* if there are blocked threads, wake 'em up! 
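+	   (cheap when uncontended: crt_blkpt_trigger bails out before any atomic read-modify-write unless a waiter has set the blocked bit)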
*/ + crt_blkpt_trigger(&l->blkpt, 0); +} + +#endif /* CRT_LOCK_H */ diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 1529c7835c..f8c21e2259 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -70,6 +70,10 @@ struct sl_global_cpu { extern struct sl_global_cpu sl_global_cpu_data[]; +typedef u32_t sched_blkpt_id_t; +#define SCHED_BLKPT_NULL 0 +typedef word_t sched_blkpt_epoch_t; + static inline struct sl_global_cpu * sl__globals_cpu(void) { @@ -120,6 +124,10 @@ sl_thdid(void) return tid; } +sched_blkpt_id_t sched_blkpt_alloc(void); +int sched_blkpt_free(sched_blkpt_id_t id); +int sched_blkpt_trigger(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, int single); +int sched_blkpt_block(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, thdid_t dependency); static inline struct sl_thd * sl_thd_curr(void) diff --git a/src/components/include/stacklist.h b/src/components/include/stacklist.h new file mode 100644 index 0000000000..eb6fb70671 --- /dev/null +++ b/src/components/include/stacklist.h @@ -0,0 +1,50 @@ +#ifndef STACKLIST_H +#define STACKLIST_H + +#include + +struct stacklist_head { + struct ps_list_head head; +}; + +struct stacklist { + thdid_t thdid; + struct ps_list list; +}; + +static inline void +stacklist_init(struct stacklist_head *h) +{ + ps_list_head_init(&h->head); +} + +/* Remove a thread from the list that has been woken */ +static inline void +stacklist_rem(struct stacklist *l) +{ + ps_list_rem_d(l); +} + +/* Add a thread that is going to block */ +static inline void +stacklist_add(struct stacklist_head *h, struct stacklist *l) +{ + ps_list_init_d(l); + ps_list_head_add_d(&h->head, l); +} + +/* Get a thread to wake up, and remove its record! */ +static inline thdid_t +stacklist_dequeue(struct stacklist_head *h) +{ + struct stacklist *sl; + + if (ps_list_head_empty(&h->head)) return 0; + + sl = ps_list_head_first_d(&h->head, struct stacklist); + stacklist_rem(sl); + + return sl->thdid; +} + +#endif /* STACKLIST_H */ diff --git a/src/components/lib/sl/Makefile b/src/components/lib/sl/Makefile index 6e908cda0b..39859fd419 100644 --- a/src/components/lib/sl/Makefile +++ b/src/components/lib/sl/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_lock.o sl_thd_static_backend.o +LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcpu.o sl_child.o sl_mod_fprr.o sl_lock.o sl_thd_static_backend.o sl_blkpt.o LIBS=$(LIB_OBJS:%.o=%.a) CINC+=-m32 diff --git a/src/components/lib/sl/sl_blkpt.c b/src/components/lib/sl/sl_blkpt.c new file mode 100644 index 0000000000..400084481d --- /dev/null +++ b/src/components/lib/sl/sl_blkpt.c @@ -0,0 +1,124 @@ +#include +#include + +#define NBLKPTS 64 +struct blkpt_mem { + sched_blkpt_id_t id; + sched_blkpt_epoch_t epoch; + struct stacklist_head blocked; +}; +static struct blkpt_mem __blkpts[NBLKPTS]; +static int __blkpt_offset = 1; + +#define BLKPT_EPOCH_BLKED_BITS ((sizeof(sched_blkpt_epoch_t) * 8) +#define BLKPT_EPOCH_DIFF (BLKPT_EPOCH_BLKED_BITS - 2)/2) +#define BLKPT_BLKED_MASK ((1 << (BLKPT_EPOCH_BLKED_BITS - 2)) - 1) + +static int +blkpt_epoch_expired(sched_blkpt_epoch_t e, sched_blkpt_epoch_t cmp) +{ + return (e > cmp && e - cmp > BLKPT_EPOCH_DIFF) || (e < cmp && cmp - e < BLKPT_EPOCH_DIFF); +} + +static struct blkpt_mem * +blkpt_get(sched_blkpt_id_t id) +{ + if (id - 1 == NBLKPTS) return NULL; + + return &__blkpts[id-1]; +} + +sched_blkpt_id_t +sched_blkpt_alloc(void) +{ + sched_blkpt_id_t id; + struct blkpt_mem *m; 
+ sched_blkpt_id_t ret = SCHED_BLKPT_NULL; + + sl_cs_enter(); + + id = (sched_blkpt_id_t)__blkpt_offset; + m = blkpt_get(id); + if (!m) ERR_THROW(SCHED_BLKPT_NULL, unlock); + + m->id = id; + ret = id; + m->epoch = 0; + stacklist_init(&m->blocked); + __blkpt_offset++; +unlock: + sl_cs_exit(); + + return ret; +} + +int +sched_blkpt_free(sched_blkpt_id_t id) +{ + /* alloc only for now */ + return 0; +} + +int +sched_blkpt_trigger(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, int single) +{ + thdid_t tid; + struct sl_thd *t; + struct blkpt_mem *m; + int ret = 0; + + sl_cs_enter(); + + m = blkpt_get(blkpt); + if (!m) ERR_THROW(-1, unlock); + + /* is the new epoch more recent than the existing? */ + if (!blkpt_epoch_expired(epoch, m->epoch)) ERR_THROW(0, unlock); + + m->epoch = epoch; + while ((tid = stacklist_dequeue(&m->blocked)) != 0) { + t = sl_thd_lkup(tid); + assert(t); + + sl_thd_wakeup_no_cs(t); /* ignore retval: process next thread */ + } + /* most likely we switch to a woken thread here */ + sl_cs_exit_schedule(); + + return 0; +unlock: + sl_cs_exit(); + + return ret; +} + +int +sched_blkpt_block(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, thdid_t dependency) +{ + struct blkpt_mem *m; + struct sl_thd *t; + struct stacklist sl; /* The stack-based structure we'll use to track ourself */ + int ret = 0; + + sl_cs_enter(); + + m = blkpt_get(blkpt); + if (!m) ERR_THROW(-1, unlock); + + /* Outdated event? don't block! */ + if (blkpt_epoch_expired(m->epoch, epoch)) ERR_THROW(0, unlock); + + /* Block! */ + stacklist_add(&m->blocked, &sl); + + t = sl_thd_curr(); + if (sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0)) ERR_THROW(-1, unlock); + + sl_cs_exit_schedule(); + + return 0; +unlock: + sl_cs_exit(); + + return ret; +} diff --git a/src/platform/i386/runscripts/crttests.sh b/src/platform/i386/runscripts/crttests.sh new file mode 100644 index 0000000000..55c6b0792b --- /dev/null +++ b/src/platform/i386/runscripts/crttests.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp tests.crt_tests.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub From 36a980cc51836abcd505cbdda4fd689cf71e97e4 Mon Sep 17 00:00:00 2001 From: Gabe Parmer Date: Mon, 6 May 2019 14:01:58 -0400 Subject: [PATCH 054/127] Refined the tests for the blkpt-based lock implementation. Note that this is currently NOT multi-core safe as it relies on the sl critical section. Will need to change the stacklist to track trebor stacks of threads to work for multi-core. 
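For reference, the protocol these tests exercise is the checkpoint/wait/trigger pattern documented in crt_blkpt.h. Below is a minimal sketch of that pattern on a one-shot flag, using only the crt_blkpt API introduced above; struct flag, flag_wait, and flag_set are illustrative names (not part of this patch), and the include path is likewise assumed.

#include <assert.h>
#include <crt_blkpt.h>

struct flag {
	unsigned long    ready; /* the data-structure's own state: 0 = unavailable (S1), 1 = available (S0) */
	struct crt_blkpt blkpt; /* tracks only the event epoch and whether anyone is blocked */
};

static void
flag_wait(struct flag *f)
{
	struct crt_blkpt_checkpoint chkpt;

	while (1) {
		crt_blkpt_checkpoint(&f->blkpt, &chkpt);
		if (ps_load(&f->ready)) return;      /* already in S0: no need to block */
		/* S1->S2: block, unless the epoch moved since the checkpoint */
		crt_blkpt_wait(&f->blkpt, 0, &chkpt);
		/* woken (or stale checkpoint): retry and re-check the data-structure */
	}
}

static void
flag_set(struct flag *f)
{
	f->ready = 1;                    /* data-structure operation first... */
	crt_blkpt_trigger(&f->blkpt, 0); /* ...then S2->S0: wake any blocked waiters */
}

The lock in crt_lock.h is this same pattern, with the CAS on owner playing the role of the data-structure operation.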
--- .../implementation/tests/crt_tests/crttests.c | 53 +++++++++++++------ src/components/include/crt_blkpt.h | 8 +-- src/components/include/stacklist.h | 2 + src/components/lib/sl/sl_blkpt.c | 14 +++-- 4 files changed, 54 insertions(+), 23 deletions(-) diff --git a/src/components/implementation/tests/crt_tests/crttests.c b/src/components/implementation/tests/crt_tests/crttests.c index 4e0c254e42..d7632a7c17 100644 --- a/src/components/implementation/tests/crt_tests/crttests.c +++ b/src/components/implementation/tests/crt_tests/crttests.c @@ -11,38 +11,61 @@ #include -#define LOCK_ITER 10 +#define LOCK_ITER 1000000 #define NTHDS 4 struct crt_lock lock; -struct sl_thd *lock_thds[NTHDS]; +struct sl_thd *lock_thds[NTHDS] = {NULL, }; +unsigned int progress[NTHDS] = {0, }; struct cos_compinfo *ci; -unsigned int -next_off(unsigned int off) +thdid_t +next_thd(void) { - return cos_thdid() * 7 + 3; + return sl_thd_thdid(lock_thds[(unsigned int)(ps_tsc() % NTHDS)]); } +volatile thdid_t holder; + void lock_thd(void *d) { - int i; - unsigned int off = cos_thdid(); + int i, cnt, me = -1; + + for (i = 0; i < NTHDS; i++) { + if (sl_thd_thdid(lock_thds[i]) != cos_thdid()) continue; + + me = i; + } + assert(me != -1); sl_thd_yield(sl_thd_thdid(lock_thds[1])); for (i = 0; i < LOCK_ITER; i++) { - off = next_off(off); - - printc("Thread %d: attempt take\n", cos_thdid()); crt_lock_take(&lock); - printc("switchto %d -> %d\n", cos_thdid(), sl_thd_thdid(lock_thds[off % NTHDS])); - sl_thd_yield(sl_thd_thdid(lock_thds[off % NTHDS])); + + progress[me]++; + holder = cos_thdid(); + + sl_thd_yield(next_thd()); + + if (holder != cos_thdid()) { + printc("FAILURE\n"); + BUG(); + } crt_lock_release(&lock); - off = next_off(off); - printc("switchto %d -> %d\n", cos_thdid(), sl_thd_thdid(lock_thds[off % NTHDS])); - sl_thd_yield(sl_thd_thdid(lock_thds[off % NTHDS])); + sl_thd_yield(next_thd()); } + + for (i = 0; i < NTHDS; i++) { + if (i == me) continue; + + if (progress[i] < LOCK_ITER) { + sl_thd_yield(sl_thd_thdid(lock_thds[i])); + } + } + + printc("SUCCESS!"); + while (1) ; } void diff --git a/src/components/include/crt_blkpt.h b/src/components/include/crt_blkpt.h index add6c19fe9..d647dc50d9 100644 --- a/src/components/include/crt_blkpt.h +++ b/src/components/include/crt_blkpt.h @@ -131,7 +131,7 @@ typedef enum { } crt_blkpt_flags_t; #define CRT_BLKPT_EPOCH_BLKED_BITS (sizeof(sched_blkpt_epoch_t) * 8) -#define CRT_BLKPT_BLKED_MASK ((1 << (CRT_BLKPT_EPOCH_BLKED_BITS - 2)) - 1) +#define CRT_BLKPT_BLKED_MASK (1 << (CRT_BLKPT_EPOCH_BLKED_BITS - 2)) #define CRT_BLKPT_BLKED(e) ((e) & CRT_BLKPT_BLKED_MASK) #define CRT_BLKPT_EPOCH(e) ((e) & ~CRT_BLKPT_BLKED_MASK) @@ -250,7 +250,7 @@ crt_blkpt_trigger(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags) * = max(epoch, ...) (for some wraparound-aware version of * max). */ - sched_blkpt_trigger(blkpt->id, CRT_BLKPT_EPOCH(saved), 0); + sched_blkpt_trigger(blkpt->id, CRT_BLKPT_EPOCH(saved + 1), 0); } /* Wake only a single, specified thread (tracked manually in the data-structure) */ @@ -284,7 +284,9 @@ crt_blkpt_wait(struct crt_blkpt *blkpt, crt_blkpt_flags_t flags, struct crt_blkp if (!CRT_BLKPT_BLKED(chkpt->epoch_blocked) && !__crt_blkpt_atomic_wait(&blkpt->epoch_blocked, chkpt->epoch_blocked, flags)) return; - sched_blkpt_block(blkpt->id, CRT_BLKPT_EPOCH(chkpt->epoch_blocked), 0); + if (unlikely(sched_blkpt_block(blkpt->id, CRT_BLKPT_EPOCH(chkpt->epoch_blocked), 0))) { + BUG(); /* we are using a blkpt id that doesn't exist! 
*/ + } } /* diff --git a/src/components/include/stacklist.h b/src/components/include/stacklist.h index eb6fb70671..2f9f19a63b 100644 --- a/src/components/include/stacklist.h +++ b/src/components/include/stacklist.h @@ -1,6 +1,7 @@ #ifndef STACKLIST_H #define STACKLIST_H +#include #include struct stacklist_head { @@ -31,6 +32,7 @@ stacklist_add(struct stacklist_head *h, struct stacklist *l) { ps_list_init_d(l); ps_list_head_add_d(&h->head, l); + l->thdid = cos_thdid(); } /* Get a thread to wake up, and remove its record! */ diff --git a/src/components/lib/sl/sl_blkpt.c b/src/components/lib/sl/sl_blkpt.c index 400084481d..423649b021 100644 --- a/src/components/lib/sl/sl_blkpt.c +++ b/src/components/lib/sl/sl_blkpt.c @@ -12,12 +12,16 @@ static int __blkpt_offset = 1; #define BLKPT_EPOCH_BLKED_BITS ((sizeof(sched_blkpt_epoch_t) * 8) #define BLKPT_EPOCH_DIFF (BLKPT_EPOCH_BLKED_BITS - 2)/2) -#define BLKPT_BLKED_MASK ((1 << (BLKPT_EPOCH_BLKED_BITS - 2)) - 1) +/* + * Is cmp > e? This is more complicated than it seems it should be + * only because of wrap-around. We have to consider the case that we + * have, and that we haven't wrapped around. + */ static int -blkpt_epoch_expired(sched_blkpt_epoch_t e, sched_blkpt_epoch_t cmp) +blkpt_epoch_is_higher(sched_blkpt_epoch_t e, sched_blkpt_epoch_t cmp) { - return (e > cmp && e - cmp > BLKPT_EPOCH_DIFF) || (e < cmp && cmp - e < BLKPT_EPOCH_DIFF); + return (e > cmp && (e - cmp) > BLKPT_EPOCH_DIFF) || (e < cmp && (cmp - e) < BLKPT_EPOCH_DIFF); } static struct blkpt_mem * @@ -73,7 +77,7 @@ sched_blkpt_trigger(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, int singl if (!m) ERR_THROW(-1, unlock); /* is the new epoch more recent than the existing? */ - if (!blkpt_epoch_expired(epoch, m->epoch)) ERR_THROW(0, unlock); + if (!blkpt_epoch_is_higher(m->epoch, epoch)) ERR_THROW(0, unlock); m->epoch = epoch; while ((tid = stacklist_dequeue(&m->blocked)) != 0) { @@ -106,7 +110,7 @@ sched_blkpt_block(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, thdid_t dep if (!m) ERR_THROW(-1, unlock); /* Outdated event? don't block! */ - if (blkpt_epoch_expired(m->epoch, epoch)) ERR_THROW(0, unlock); + if (blkpt_epoch_is_higher(m->epoch, epoch)) ERR_THROW(0, unlock); /* Block! */ stacklist_add(&m->blocked, &sl); From 1f721c562d6c5af429461e839b2d2636ea5138ac Mon Sep 17 00:00:00 2001 From: Gabe Parmer Date: Mon, 6 May 2019 20:08:14 -0400 Subject: [PATCH 055/127] Maked one aspect of blockpoints work on multi-core: blocked thread tracking - Simply used a Treiber Stack (CAS-driven stack) that doesn't have ABA problems given the blocking behavior of threads. - Remaining problem: we don't use `sl`'s support to wake threads across cores. --- src/components/include/stacklist.h | 72 +++++++++++++++++++++++------- src/components/lib/sl/sl_blkpt.c | 1 + 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/src/components/include/stacklist.h b/src/components/include/stacklist.h index 2f9f19a63b..fdd57d0714 100644 --- a/src/components/include/stacklist.h +++ b/src/components/include/stacklist.h @@ -1,38 +1,59 @@ #ifndef STACKLIST_H #define STACKLIST_H -#include -#include +/** + * Modified to support multi-core via a Trebor stack. This is not 100% + * a great solution as it isn't FIFO. However, we release *all* + * threads when unlocking, so the priority scheduling should take over + * at that point. 
+ */ -struct stacklist_head { - struct ps_list_head head; -}; +#include +#include struct stacklist { thdid_t thdid; - struct ps_list list; + struct stacklist *next; +}; + +struct stacklist_head { + struct stacklist *head; }; static inline void stacklist_init(struct stacklist_head *h) { - ps_list_head_init(&h->head); + h->head = NULL; } -/* Remove a thread from the list that has been woken */ -static inline void +/* + * Remove a thread from the list that has been woken. Return 0 on + * success, and 1 if it could not be removed. + */ +static inline int stacklist_rem(struct stacklist *l) { - ps_list_rem_d(l); + /* + * Not currently supported with Trebor Stack. Threads that + * wake early still have to wait their turn. + */ + return 1; } /* Add a thread that is going to block */ static inline void stacklist_add(struct stacklist_head *h, struct stacklist *l) { - ps_list_init_d(l); - ps_list_head_add_d(&h->head, l); l->thdid = cos_thdid(); + l->next = NULL; + assert(h); + + while (1) { + struct stacklist *n = ps_load(&h->head); + + l->next = n; + if (ps_cas((unsigned long *)&h->head, (unsigned long)n, (unsigned long)l)) break; + } } /* Get a thread to wake up, and remove its record! */ @@ -41,12 +62,33 @@ stacklist_dequeue(struct stacklist_head *h) { struct stacklist *sl; - if (ps_list_head_empty(&h->head)) return 0; + if (!h->head) return 0; + + /* + * Only a single thread should trigger an event, and dequeue + * threads, but we'll implement this conservatively. Given + * this, please note that this should *not* iterate more than + * once. + */ + while (1) { + sl = ps_load(&h->head); - sl = ps_list_head_first_d(&h->head, struct stacklist); - stacklist_rem(sl); + if (ps_cas((unsigned long *)&h->head, (unsigned long)sl, (unsigned long)sl->next)) break; + } + sl->next = NULL; return sl->thdid; } +/* + * A thread that wakes up after blocking using a stacklist should be + * able to assume that it is no longer on the list. This enables them + * to assert on that fact. + */ +static inline int +stacklist_is_removed(struct stacklist *l) +{ + return l->next == NULL; +} + #endif /* STACKLIST_H */ diff --git a/src/components/lib/sl/sl_blkpt.c b/src/components/lib/sl/sl_blkpt.c index 423649b021..dac56db1d1 100644 --- a/src/components/lib/sl/sl_blkpt.c +++ b/src/components/lib/sl/sl_blkpt.c @@ -119,6 +119,7 @@ sched_blkpt_block(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, thdid_t dep if (sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0)) ERR_THROW(-1, unlock); sl_cs_exit_schedule(); + assert(stacklist_is_removed(&sl)); /* we cannot still be on the list */ return 0; unlock: From ecce29d3bdefa7641d51ec48dd8168cb6d976e72 Mon Sep 17 00:00:00 2001 From: Gabe Parmer Date: Mon, 6 May 2019 20:14:34 -0400 Subject: [PATCH 056/127] Fixed spelling error in stacklist header comment --- src/components/include/stacklist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/include/stacklist.h b/src/components/include/stacklist.h index fdd57d0714..8651a408a7 100644 --- a/src/components/include/stacklist.h +++ b/src/components/include/stacklist.h @@ -2,7 +2,7 @@ #define STACKLIST_H /** - * Modified to support multi-core via a Trebor stack. This is not 100% + * Modified to support multi-core via a Treiber stack. This is not 100% * a great solution as it isn't FIFO. However, we release *all* * threads when unlocking, so the priority scheduling should take over * at that point. 
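The intended use of the stacklist, condensed from sched_blkpt_block and sched_blkpt_trigger above: the node lives on the blocked thread's stack, so no allocation or free is needed, and the memory remains valid until the waker has dequeued it. A sketch follows; thd_block and thd_wakeup are placeholders standing in for the scheduler's own block/wakeup calls (e.g. sl_thd_block_no_cs and sl_thd_wakeup_no_cs), not functions defined in this patch.

#include <assert.h>
#include <stacklist.h>

/* Placeholders for the scheduler's block/wakeup primitives. */
extern void thd_block(void);
extern void thd_wakeup(thdid_t tid);

static void
wait_on(struct stacklist_head *waiters)
{
	struct stacklist me; /* valid for as long as this thread is blocked here */

	stacklist_add(waiters, &me); /* Treiber-stack push; records cos_thdid() */
	thd_block();
	/* the waker dequeued this node before waking us, so it is off the list */
	assert(stacklist_is_removed(&me));
}

static void
wake_all(struct stacklist_head *waiters)
{
	thdid_t tid;

	/* LIFO drain: wake everyone and let thread priorities decide who runs first */
	while ((tid = stacklist_dequeue(waiters)) != 0) thd_wakeup(tid);
}

This is the shape of sched_blkpt_block, which places its struct stacklist on the blocking thread's stack and asserts it was removed once it resumes.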
From 4f1820fd644b50f73c68c077e5b4f31b7573e0e4 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 8 May 2019 15:27:52 -0400 Subject: [PATCH 057/127] part: parallel runtime in composite, first draft * with usage in cos_gomp * TODO: pretty much everything! --- .../no_interface/omp_dijkstra/Makefile | 2 +- .../no_interface/omp_dijkstra/dijkstra_omp.c | 7 +- .../no_interface/omp_hello/Makefile | 2 +- .../no_interface/omp_hello/hello_omp.c | 19 +- src/components/include/cirque.h | 128 ++++++++ src/components/include/cos_debug.h | 2 +- src/components/include/cos_omp.h | 3 +- src/components/include/deque.h | 111 +++++++ src/components/include/part.h | 214 +++++++++++++ src/components/include/part_task.h | 205 +++++++++++++ src/components/include/sl.h | 6 +- src/components/include/sl_thd.h | 2 + src/components/include/sl_xcore.h | 6 +- src/components/lib/Makefile | 2 +- src/components/lib/cos_gomp/Makefile | 2 +- src/components/lib/cos_gomp/cos_gomp.c | 290 ++++++++++++++---- src/components/lib/cos_gomp/cos_gomp.h | 11 + src/components/lib/{ => cos_gomp}/cos_omp.c | 25 +- src/components/lib/part.c | 57 ++++ src/components/lib/sl/sl_mod_fifo.c | 1 - src/components/lib/sl/sl_xcore.c | 15 +- 21 files changed, 1009 insertions(+), 101 deletions(-) create mode 100644 src/components/include/cirque.h create mode 100644 src/components/include/deque.h create mode 100644 src/components/include/part.h create mode 100644 src/components/include/part_task.h create mode 100644 src/components/lib/cos_gomp/cos_gomp.h rename src/components/lib/{ => cos_gomp}/cos_omp.c (89%) create mode 100644 src/components/lib/part.c diff --git a/src/components/implementation/no_interface/omp_dijkstra/Makefile b/src/components/implementation/no_interface/omp_dijkstra/Makefile index 664e201ce2..1a7d4146ce 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/Makefile +++ b/src/components/implementation/no_interface/omp_dijkstra/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_dijkstra.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp -lcos_omp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c index fd3b60d4e1..da0fb3c4e9 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c +++ b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c @@ -571,6 +571,7 @@ cos_main(void *d) main(); } +extern void cos_gomp_init(void); void cos_init(void *d) { @@ -583,14 +584,12 @@ cos_init(void *d) PRINTC("In OpenMP-based Hello Program!\n"); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init(); - cos_omp_init(); - } else { while (!ps_load((unsigned long *)&init_done[first])) ; cos_defcompinfo_sched_init(); - sl_init(SL_MIN_PERIOD_US*100); } + sl_init(SL_MIN_PERIOD_US*100); ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ @@ -601,7 +600,7 @@ cos_init(void *d) if (!cos_cpuid()) { struct sl_thd *t = NULL; - sl_init(SL_MIN_PERIOD_US*100); + cos_gomp_init(); t = sl_thd_alloc(cos_main, NULL); assert(t); sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); diff --git a/src/components/implementation/no_interface/omp_hello/Makefile b/src/components/implementation/no_interface/omp_hello/Makefile index e62f427203..9ecad31df4 100644 --- a/src/components/implementation/no_interface/omp_hello/Makefile +++ b/src/components/implementation/no_interface/omp_hello/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_hello.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp -lcos_omp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_hello/hello_omp.c b/src/components/implementation/no_interface/omp_hello/hello_omp.c index 081b4a5821..e9c0932b5c 100644 --- a/src/components/implementation/no_interface/omp_hello/hello_omp.c +++ b/src/components/implementation/no_interface/omp_hello/hello_omp.c @@ -57,14 +57,16 @@ int main ( void ) /* INSIDE THE PARALLEL REGION, have each thread say hello. */ -#if 0 +#if 1 #pragma omp parallel -#pragma omp for + { +#pragma omp for schedule(dynamic) for (id = 0; id < 10; id++) { PRINTC("id:%u\n", id); } + } #else -# pragma omp parallel \ +# pragma omp parallel\ private ( id ) { id = omp_get_thread_num ( ); @@ -95,8 +97,12 @@ static void cos_main(void *d) { main(); + + while (1); } +extern void cos_gomp_init(void); + void cos_init(void *d) { @@ -109,14 +115,12 @@ cos_init(void *d) PRINTC("In OpenMP-based Hello Program!\n"); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init(); - cos_omp_init(); - } else { while (!ps_load((unsigned long *)&init_done[first])) ; cos_defcompinfo_sched_init(); - sl_init(SL_MIN_PERIOD_US*100); } + sl_init(SL_MIN_PERIOD_US*100); ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ @@ -127,7 +131,8 @@ cos_init(void *d) if (!cos_cpuid()) { struct sl_thd *t = NULL; - sl_init(SL_MIN_PERIOD_US*100); + cos_gomp_init(); + t = sl_thd_alloc(cos_main, NULL); assert(t); sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); diff --git a/src/components/include/cirque.h b/src/components/include/cirque.h new file mode 100644 index 0000000000..8c63772322 --- /dev/null +++ b/src/components/include/cirque.h @@ -0,0 +1,128 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ +#ifndef CIRQUE_H +#define CIRQUE_H + +/* remember to use multi-core locks as these are really single producer, single consumer */ +#define CIRQUE_MAX_SZ 4096 + +#define CIRQUE_PROTOTYPE(name, type) \ +struct cirque_##name { \ + type wrk[CIRQUE_MAX_SZ]; \ + size_t size; \ + size_t mask; \ + \ + volatile long head; \ + volatile long tail; \ +}; \ + \ +static inline void \ +cirque_init_##name(struct cirque_##name *q, size_t sz) \ +{ \ + memset(q, 0, sizeof(struct cirque_##name)); \ + \ + if (sz) { \ + /* only for size with pow of 2 */ \ + assert(round_to_pow2(sz) == sz); \ + assert(sz <= CIRQUE_MAX_SZ); \ + } else { \ + sz = CIRQUE_MAX_SZ; \ + } \ + \ + q->head = q->tail = 0; \ + q->size = sz; \ + q->mask = sz - 1; \ +} \ + \ +static inline int \ +cirque_insert_##name(struct cirque_##name *q, type *w) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if ((ct == 0 && ch == q->mask) || \ + ((ch + 1) & q->mask) == ct) return -ENOSPC; \ + \ + ps_mem_fence(); \ + if (!ps_cas((unsigned long *)q->head, ch, \ + (ch + 1) & q->mask)) return -EAGAIN; \ + q->wrk[ch] = *w; \ + \ + return 0; \ +} \ + \ +static inline int \ +cirque_delete_##name(struct cirque_##name *q, type *w) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return -ENOENT; \ + \ + *w = q->wrk[ct]; \ + if (!ps_cas((unsigned long *)q->tail, ct, \ + (ct + 1) & q->mask)) return -EAGAIN; \ + \ + return 0; \ +} \ + \ +static inline int \ +cirque_peek_##name(struct cirque_##name *q, type *w) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return -ENOENT; \ + \ + *w = q->wrk[ct]; \ + \ + return 0; \ +} \ + \ +static inline type * \ +cirque_allocptr_##name(struct cirque_##name *q) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if ((ct == 0 && ch == q->mask) || \ + ((ch + 1) & q->mask) == ct) return NULL; \ + \ + ps_mem_fence(); \ + if (!ps_cas((unsigned long *)q->head, ch, \ + (ch + 1) & q->mask)) return NULL; \ + \ + return &q->wrk[ch]; \ +} \ + \ +static inline void \ +cirque_freeptr_##name(struct cirque_##name *q) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return; \ + \ + if (ps_cas((unsigned long *)q->tail, ct, (ct + 1) & q->mask)) { \ + memset(&q->wrk[ct], 0, sizeof(type)); \ + } \ + \ + return; \ +} \ + \ +static inline type * \ +cirque_peekptr_##name(struct cirque_##name *q) \ +{ \ + long ct = ps_load((unsigned long *)&q->tail); \ + long ch = ps_load((unsigned long *)&q->head); \ + \ + if (ct >= ch) return NULL; \ + \ + return &q->wrk[ct]; \ +} + +#endif /* CIRQUE_H */ diff --git a/src/components/include/cos_debug.h b/src/components/include/cos_debug.h index c646c1b977..6e8bb00825 100644 --- a/src/components/include/cos_debug.h +++ b/src/components/include/cos_debug.h @@ -9,7 +9,7 @@ #endif #ifndef PRINT_FN -#define PRINT_FN prints +#define PRINT_FN PRINTC #endif #include diff --git a/src/components/include/cos_omp.h b/src/components/include/cos_omp.h index 61f65b7d1c..8933449ae9 100644 --- a/src/components/include/cos_omp.h +++ b/src/components/include/cos_omp.h @@ -8,10 +8,11 @@ #ifndef COS_OMP_H #define COS_OMP_H +#include #include #include -#define 
COS_OMP_MAX_NUM_THREADS (NUM_CPU) +#define COS_OMP_MAX_NUM_THREADS (PART_MAX_THDS) struct cos_icv_data_env { unsigned dyn_var; diff --git a/src/components/include/deque.h b/src/components/include/deque.h new file mode 100644 index 0000000000..21422eab26 --- /dev/null +++ b/src/components/include/deque.h @@ -0,0 +1,111 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ +#ifndef DEQUE_H +#define DEQUE_H + +/* + * This was implemented by referring to: + * https://github.com/cpp-taskflow/cpp-taskflow/blob/9c28ccec910346a9937c40db7bdb542262053f9c/taskflow/executor/workstealing.hpp + * + * which is based on the following papers: + * + * The work stealing queue described in the paper, "Dynamic Circular Work-stealing Deque," SPAA, 2015. + * Only the queue owner can perform pop and push operations, while others can steal data from the queue. + * + * PPoPP implementation paper, "Correct and Efficient Work-Stealing for Weak Memory Models" + * https://www.di.ens.fr/~zappa/readings/ppopp13.pdf + */ +#define DEQUE_MAX_SZ 4096 + +#define DEQUE_PROTOTYPE(name, type) \ +struct deque_##name { \ + type wrk[DEQUE_MAX_SZ]; \ + long size; \ + \ + volatile long top; \ + volatile long bottom; \ +}; \ + \ +static inline void \ +deque_init_##name(struct deque_##name *q, size_t sz) \ +{ \ + memset(q, 0, sizeof(struct deque_##name)); \ + \ + if (sz) { \ + /* only for size with pow of 2 */ \ + assert(sz & (sz - 1)); \ + assert(sz <= DEQUE_MAX_SZ); \ + } else { \ + sz = DEQUE_MAX_SZ; \ + } \ + \ + q->size = sz; \ +} \ + \ +/* Use mutual exclusion locks around push/pop if multi-threaded. */ \ +static inline int \ +deque_push_##name(struct deque_##name *q, type *w) \ +{ \ + long ct, cb; \ + \ + ct = ps_load((unsigned long *)&q->top); \ + cb = ps_load((unsigned long *)&q->bottom); \ + \ + /* nope, fixed size only */ \ + if (q->size - 1 < (cb - ct)) return -ENOSPC; \ + \ + q->wrk[cb] = *w; \ + ps_mem_fence(); \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, cb + 1)) assert(0); \ + \ + return -EAGAIN; \ +} \ + \ +/* Use mutual exclusion locks around push/pop if multi-threaded. 
*/ \ +static inline int \ +deque_pop_##name(struct deque_##name *q, type *w) \ +{ \ + long ct = ps_load((unsigned long *)&q->top); \ + long cb = ps_load((unsigned long *)&q->bottom) - 1; \ + long sz = cb - ct; \ + int ret = 0; \ + \ + if (!ps_upcas((unsigned long *)&q->bottom, cb + 1, cb)) assert(0); \ + \ + if (sz < 0) { \ + if (!ps_cas((unsigned long *)&q->bottom, cb, cb + 1)) assert(0); \ + \ + return -ENOENT; \ + } \ + \ + *w = q->wrk[cb]; \ + if (sz > 0) return 0; \ + \ + ret = ps_cas((unsigned long *)&q->top, ct, ct + 1); \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, ct + 1)) assert(0); \ + if (!ret) return -ENOENT; \ + \ + return 0; \ +} \ + \ +static inline int \ +deque_steal_##name(struct deque_##name *q, type *w) \ +{ \ + long ct, cb; \ + \ + ct = ps_load((unsigned long *)&q->top); \ + cb = ps_load((unsigned long *)&q->bottom); \ + \ + if (ct >= cb) return -ENOENT; \ + \ + *w = q->wrk[ct]; \ + if (!ps_cas((unsigned long *)&q->top, ct, ct + 1)) return -EAGAIN; \ + \ + return 0; \ +} + +#endif /* DEQUE_H */ diff --git a/src/components/include/part.h b/src/components/include/part.h new file mode 100644 index 0000000000..57a505909c --- /dev/null +++ b/src/components/include/part.h @@ -0,0 +1,214 @@ +#ifndef PART_H +#define PART_H + +#include +#include +#include + +#include +//#include + +DEQUE_PROTOTYPE(part, struct part_task *); +//CIRQUE_PROTOTYPE(part, struct part_task); + +extern struct deque_part part_dq_percore[]; +//extern struct cirque_par parcq_global; +extern struct ps_list_head part_l_global; + +static inline struct deque_part * +part_deque_curr(void) +{ + return &part_dq_percore[cos_cpuid()]; +} + +static inline struct deque_part * +part_deque_core(cpuid_t c) +{ + assert(c < NUM_CPU); + + return &part_dq_percore[c]; +} + +//static inline struct cirque_par * +//part_cirque(void) +//{ +// return &parcq_global; +//} + +static inline struct ps_list_head * +part_list(void) +{ + return &part_l_global; +} + +static inline int +part_deque_push(struct part_task *t) +{ + int ret; + + sl_cs_enter(); + ret = deque_push_part(part_deque_curr(), &t); + sl_cs_exit(); + + return ret; +} + +static inline int +part_deque_pop(struct part_task *t) +{ + int ret; + + sl_cs_enter(); + ret = deque_pop_part(part_deque_curr(), &t); + sl_cs_exit(); + + return ret; +} + +static inline struct part_task * +part_deque_steal(cpuid_t core) +{ + int ret; + struct part_task *t = NULL; + + ret = deque_steal_part(part_deque_core(core), &t); + if (ret) return NULL; + + return t; +} + +static inline struct part_task * +part_deque_steal_any(void) +{ + unsigned i = 0, c = (unsigned)(ps_tsc() % NUM_CPU); + + do { + struct part_task *t = NULL; + + i ++; + if (c == (unsigned)cos_cpuid()) c = (c + 1) % NUM_CPU; + + t = part_deque_steal(c); + if (t) return t; + } while (i < NUM_CPU); + + return NULL; +} + +///* ds memory in a circular queue */ +//static inline struct part_task * +//part_cirque_alloc(void) +//{ +// return cirque_allocptr_par(part_cirque()); +//} +// +//static inline void +//part_cirque_free(void) +//{ +// cirque_freeptr_par(part_cirque()); +//} +// +//static inline struct part_task * +//part_cirque_peek(void) +//{ +// return cirque_peekptr_par(part_cirque()); +//} + +/* TODO: lock for shared list! 
*/ +static inline void +part_list_append(struct part_task *t) +{ + assert(ps_list_singleton(t, partask)); + assert(t->type == PART_TASK_T_WORKSHARE); + + ps_list_head_append(part_list(), t, partask); +} + +static inline void +part_list_remove(struct part_task *t) +{ + assert(t->type == PART_TASK_T_WORKSHARE); + assert(!ps_list_singleton(t, partask)); + + ps_list_rem(t, partask); +} + +static inline struct part_task * +part_list_peek(void) +{ + struct part_task *t = NULL; + + if (ps_list_head_empty(part_list())) return NULL; + /* not great! traversing from the first element always! */ + /* TODO: perhaps traverse from the current task? */ + ps_list_foreach(part_list(), t, partask) { + int i; + + assert(t); + + assert(t->type == PART_TASK_T_WORKSHARE); + /* coz, master thread adds to list the implicit task and doesn't defer it */ + i = part_task_work_try(t); + assert(i != 0); + + if (i > 0) return t; + } + + return NULL; +} + +void part_init(void); + +unsigned part_isready(void); + +static inline void +part_thd_fn(void *d) +{ + struct sl_thd *curr = sl_thd_curr(); + + while (!part_isready()) sl_thd_yield(0); + while (ps_list_head_empty(part_list())) sl_thd_yield(0); + + while (1) { + struct part_task *t = NULL; + int ret; + int thdnum = 0; + unsigned thd = cos_cpuid() << 16 | cos_thdid(); + + /* FIXME: nested parallel needs love! */ + t = part_list_peek(); + if (t) goto found; + +single: + ret = part_deque_pop(t); + if (ret == 0) { + assert(t->type != PART_TASK_T_WORKSHARE); + + goto found; + } + + if (ret == -EAGAIN) goto single; + + t = part_deque_steal_any(); + if (!t) { + sl_thd_yield(0); + continue; + } + assert(t->type != PART_TASK_T_WORKSHARE); +found: + thdnum = part_task_work_try(t); + if (thdnum < 0) continue; + if (t->type != PART_TASK_T_WORKSHARE) assert(thdnum == 0); + curr->part_context = (void *)t; + + t->cs.fn(t->cs.data); + + if (t->type != PART_TASK_T_WORKSHARE) continue; + + part_task_barrier(t); + } + + sl_thd_exit(); +} + +#endif /* PART_H */ diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h new file mode 100644 index 0000000000..a88ce9b23f --- /dev/null +++ b/src/components/include/part_task.h @@ -0,0 +1,205 @@ +#ifndef PART_TASK_H +#define PART_TASK_H + +#include +#include +#include +#include + +#define PART_THD(c, t) (cos_cpuid() << 16 | cos_thdid()) +#define PART_CURR_THD PART_THD(cos_cpuid(), cos_thdid()) + +#define PART_MAX 4 +#define PART_MAX_CORE_THDS 4 +#define PART_MAX_THDS PART_MAX_CORE_THDS*NUM_CPU +#define PART_MAX_CHILD PART_MAX +#define PART_MAX_WORKSHARES 8 + +typedef void (*part_fn_t)(void *); + +typedef enum { + PART_TASK_S_FREED, + PART_TASK_S_ALLOCATED, + PART_TASK_S_RUNNING, + PART_TASK_S_CHILD_WAIT, /* WAIT FOR CHILD TASKS */ + PART_TASK_S_SIBLING_WAIT, /* WAIT FOR SIBLING TASKS */ + PART_TASK_S_PARENT_WAIT, /* WAIT FOR PARENT TASK */ + PART_TASK_S_IN_BARRIER, /* WAIT FOR ALL OTHER THREADS */ +} part_task_state_t; + +typedef enum { + PART_TASK_T_NONE, + PART_TASK_T_WORKSHARE, /* task to put in a shared fifo queue */ +} part_task_type_t; + +typedef enum { + PART_WORKSHARE_NONE, + PART_WORKSHARE_LOOP_STATIC, + PART_WORKSHARE_LOOP_DYNAMIC, + PART_WORKSHARE_LOOP_GUIDED, + PART_WORKSHARE_LOOP_RUNTIME, + PART_WORKSHARE_SECTIONS, + PART_WORKSHARE_SINGLE, +} part_workshare_type_t; + +struct part_workshare { + part_workshare_type_t type; + + long chunk_sz; + + long st, end, inc; + + long next; + + unsigned worker_bmp; +}; + +struct part_closure { + part_fn_t fn; + void *data; +}; + +struct part_task { + part_task_state_t 
state; + part_task_type_t type; + + struct part_workshare ws[PART_MAX_WORKSHARES]; + struct part_closure cs; + + unsigned nthds; /* number of threads for this task, 1 in case of non-workshare work */ + unsigned workers[PART_MAX_THDS]; /* threads sharing this work or thread doing this work! */ + int ws_off[PART_MAX_THDS]; /* progress of the workshares in each participating thread */ + //unsigned nwsdone; + unsigned master; /* coreid << 16 | thdid of the master */ + unsigned barrier_in, barrier_out; + + /* TODO: parent to wait on all child tasks for taskwait synchronization! */ + struct part_task *parent; + struct part_task *child[PART_MAX_CHILD]; + + struct ps_list partask; +} CACHE_ALIGNED; + +static inline void +part_task_init(struct part_task *t, part_task_type_t type, struct part_task *p, unsigned nthds, part_fn_t fn, void *data) +{ + int i; + + memset(t, 0, sizeof(struct part_task)); + + ps_list_init(t, partask); + t->type = type; + t->state = PART_TASK_S_ALLOCATED; + t->parent = p; + t->nthds = nthds; + + t->master = PART_CURR_THD; + t->cs.fn = fn; + t->cs.data = data; + + for (i = 0; i < PART_MAX_THDS; i++) t->ws_off[i] = -1; + + /* if it's worksharing, current thread is the master and does take part in the par section */ + if (type == PART_TASK_T_WORKSHARE) t->workers[0] = t->master; +} + +static inline int +part_task_add_child(struct part_task *t, struct part_task *c) +{ + int i; + + for (i = 0; i < PART_MAX_CHILD; i++) { + if (t->child[i] == 0 && ps_cas((unsigned long *)&t->child[i], 0, (unsigned long)c)) return i; + } + + return -1; +} + +static inline void +part_task_remove_child(struct part_task *t, struct part_task *c) +{ + int i; + + if (!t || !c) return; + + for (i = 0; i < PART_MAX_CHILD; i++) { + if (t->child[i] != c) continue; + + if (!ps_cas((unsigned long *)&t->child[i], (unsigned long)c, 0)) assert(0); + } +} + +static inline int +part_task_work_try(struct part_task *t) +{ + int i = 0; + unsigned key = PART_CURR_THD; + + if (t->type != PART_TASK_T_WORKSHARE) { + assert(t->nthds == 1); + } else { + assert(t->master != key && t->master == t->workers[0]); + i = 1; + } + + for (; i < (int)t->nthds; i++) + { + if (t->workers[i] == key) return i; + if (t->workers[i]) continue; + + if (ps_cas((unsigned long *)&t->workers[i], 0, key)) return i; + } + + return -1; +} + +static inline int +part_task_work_thd_num(struct part_task *t) +{ + int i; + unsigned key = PART_CURR_THD; + + if (t->type != PART_TASK_T_WORKSHARE) assert(t->nthds == 1); + + if (key == t->master) return 0; + for (i = 1; i < (int)t->nthds; i++) { + if (t->workers[i] == key) return i; + } + + return -1; +} + +static inline void +part_task_barrier(struct part_task *t) +{ + int tn = part_task_work_thd_num(t); + unsigned cin = 0, cout = 0; + + assert(tn >= 0); + + if (t->nthds == 1) { + assert(tn == 0 && t->barrier_in == 0); + + return; + } + + /* wait for all siblings to have seen the previous barrier */ + while (ps_load((unsigned long *)&t->barrier_out) % t->nthds) sl_thd_yield(0); + + cin = ps_faa((unsigned long *)&t->barrier_in, 1); + if (cin % t->nthds == t->nthds - 1) { + int i; + + /* wait for all child tasks to complete, including explicit tasks */ + for (i = 0; i < PART_MAX_CHILD; i++) { + while (ps_load((unsigned long *)&t->child[i])) sl_thd_yield(0); + } + } else { + /* wait for all sibling tasks to reach in barrier! 
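+		   (i.e. yield until barrier_in is again a multiple of nthds, which happens once the last sibling arrives)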
*/ + while (ps_load((unsigned long *)&t->barrier_in) % t->nthds != 0) sl_thd_yield(0); + } + + ps_faa((unsigned long *)&t->barrier_out, 1); +} + +#endif /* PART_TASK_H */ diff --git a/src/components/include/sl.h b/src/components/include/sl.h index b2acbeef4e..f54725831e 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -425,10 +425,12 @@ static inline int sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr), *nd = sl_thd_dcbinfo(next); - if (unlikely(!sl_thd_dcbinfo(curr) || !sl_thd_dcbinfo(next))) { + if (unlikely(!cd || !nd)) { return sl_thd_kern_dispatch(sl_thd_thdcap(next)); } + /* * jump labels in the asm routine: * @@ -467,7 +469,7 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "3:\n\t" \ "popl %%ebp\n\t" \ : - : "a" (sl_thd_dcbinfo(curr)), "b" (sl_thd_dcbinfo(next)), + : "a" (cd), "b" (nd), "S" ((u32_t)((u64_t)tok >> 32)), "D" ((u32_t)(((u64_t)tok << 32) >> 32)), "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); diff --git a/src/components/include/sl_thd.h b/src/components/include/sl_thd.h index f2c0107484..25bfd572e1 100644 --- a/src/components/include/sl_thd.h +++ b/src/components/include/sl_thd.h @@ -91,6 +91,8 @@ struct sl_thd { struct ps_list SL_THD_EVENT_LIST; /* list of events for the scheduler end-point */ struct cos_dcb_info *dcb; + + void *part_context; /* used by the parallelism stuff! */ }; static inline struct cos_dcb_info * diff --git a/src/components/include/sl_xcore.h b/src/components/include/sl_xcore.h index 383d45fe67..618246dd9d 100644 --- a/src/components/include/sl_xcore.h +++ b/src/components/include/sl_xcore.h @@ -84,13 +84,15 @@ CK_RING_PROTOTYPE(xcore, sl_xcore_request); * branches around in the code for core-local scheduling! * Also, making this struct explicit, makes API use explicit. * I should only be able to use: param_set(), wakeup() and perhaps free(). + * + * Change my mind! This is a shit ton of wastage with CACHE_ALIGNED! */ struct sl_xcore_thd { thdid_t thd; cpuid_t core; - asndcap_t asnd; -}; + asndcap_t asnd[NUM_CPU]; +} CACHE_ALIGNED; struct sl_xcore_thd *sl_xcore_thd_lookup(thdid_t tid, cpuid_t core); static inline thdid_t diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index 7bec69a198..0255456e16 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o cos_omp.o +LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o part.o LIBS=$(LIB_OBJS:%.o=%.a) MANDITORY=c_stub.o cos_asm_upcall.o cos_asm_ainv.o cos_component.o MAND=$(MANDITORY_LIB) diff --git a/src/components/lib/cos_gomp/Makefile b/src/components/lib/cos_gomp/Makefile index 536e0ec430..ad4c1f75f9 100644 --- a/src/components/lib/cos_gomp/Makefile +++ b/src/components/lib/cos_gomp/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -OBJS=cos_gomp.o +OBJS=cos_omp.o cos_gomp.o LIB=cos_gomp CINC+=-m32 diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index 49b65a28ac..78f3ea23af 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -12,105 +12,285 @@ #include #include +#include #include /* for now, single core lock! 
*/ #include -#define _THD_FIXED_PRIO 1 -#define _THD_LOCAL_ACTIVATE(t) sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, _THD_FIXED_PRIO)) -static struct sl_lock _cos_gomp_lock = SL_LOCK_STATIC_INIT(); +#include "cos_gomp.h" +#include -static void -_cos_gomp_thd_fn(void *d) +#define COS_GOMP_MAX_EXPLICIT_TASKS 1024 +#define COS_GOMP_MAX_IMPLICIT_TASKS 512 + +static struct part_task _itasks[COS_GOMP_MAX_IMPLICIT_TASKS], _etasks[COS_GOMP_MAX_EXPLICIT_TASKS]; +static unsigned _itask_free, _etask_free; + +static inline struct part_task * +_cos_gomp_alloc_implicit(void) { - int *ndone = (int *)d; - struct sl_thd *t = sl_thd_curr(); - struct cos_aep_info *a = sl_thd_aepinfo(t); - cos_thd_fn_t fn = NULL; + unsigned i = ps_faa((unsigned long *)&_itask_free, 1); - /* - * TODO: - * 1. Understand how gomp works with fn & data and what exactly is being passed! - * 2. If work-stealing.. well, where am I stealing from! (void *d) should help with that! - */ + assert(i < COS_GOMP_MAX_IMPLICIT_TASKS); + return &_itasks[i]; +} - assert(a->fn); - fn = (cos_thd_fn_t)a->fn; - fn(a->data); - ps_faa((unsigned long *)ndone, 1); +static inline struct part_task * +_cos_gomp_alloc_explicit(void) +{ + unsigned i = ps_faa((unsigned long *)&_etask_free, 1); - sl_thd_exit(); + assert(i < COS_GOMP_MAX_EXPLICIT_TASKS); + return &_etasks[i]; } -static inline unsigned -_cos_gomp_num_threads(unsigned num_thds) +void +cos_gomp_init(void) { - return num_thds > 0 ? num_thds : (unsigned)omp_get_max_threads(); + memset(_itasks, 0, sizeof(struct part_task) * COS_GOMP_MAX_IMPLICIT_TASKS); + memset(_etasks, 0, sizeof(struct part_task) * COS_GOMP_MAX_EXPLICIT_TASKS); + _itask_free = _etask_free = 0; + + cos_omp_init(); + part_init(); +} + +static inline void +_gomp_parallel_start(struct part_task *pt, void (*fn) (void *), void *data, unsigned num_threads, unsigned flags) +{ + int parent_off; + struct sl_thd *t = sl_thd_curr(); + struct part_task *parent = (struct part_task *)t->part_context; + + num_threads = num_threads ? ((num_threads > COS_GOMP_MAX_THDS) ? COS_GOMP_MAX_THDS : num_threads) : PART_MAX; + part_task_init(pt, PART_TASK_T_WORKSHARE, parent, num_threads, fn, data); + if (parent) { + parent_off = part_task_add_child(parent, pt); + assert(parent_off >= 0); + } + t->part_context = pt; + + if (num_threads > 1) part_list_append(pt); +} + +static inline void +_gomp_parallel_end(struct part_task *pt) +{ + struct sl_thd *t = sl_thd_curr(); + + /* implicit barrier */ + part_task_barrier(pt); + + if (pt->nthds > 1) part_list_remove(pt); + + t->part_context = pt->parent; + part_task_remove_child(pt->parent, pt); } /* GOMP_parallel prototype from libgomp within gcc */ void GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, - unsigned int flags) + unsigned int flags) { - /* FIXME: improve everything! 
*/ - unsigned i; - unsigned long num_done = 0; + struct part_task pt; + + _gomp_parallel_start(&pt, fn, data, num_threads, flags); + fn(data); + _gomp_parallel_end(&pt); +} + +bool +GOMP_single_start(void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + int i; + int coff = part_task_work_thd_num(t); + unsigned b = 1 << coff; + + assert(coff >= 0 && coff < (int)t->nthds); + for (i = t->ws_off[coff] + 1; i < PART_MAX_WORKSHARES; i++) { + struct part_workshare *pw = &t->ws[i]; + unsigned c; - num_threads = _cos_gomp_num_threads(num_threads); - assert(num_threads <= MAX_NUM_THREADS); - for (i = 1; i < num_threads; i++) { - struct sl_thd *t = NULL; - struct cos_aep_info *a = NULL; + if (ps_load((unsigned long *)&pw->type) == PART_WORKSHARE_NONE) { + /* perhaps one of the threads just converted it to a single */ + if (!ps_cas((unsigned long *)&pw->type, PART_WORKSHARE_NONE, PART_WORKSHARE_SINGLE)) assert(pw->type == PART_WORKSHARE_SINGLE); + } + if (ps_load((unsigned long *)&pw->type) != PART_WORKSHARE_SINGLE) continue; - /* TODO: any handling of AEPs? */ - t = sl_thd_alloc(_cos_gomp_thd_fn, (void *)&num_done); - assert(t); +retry_bmp: + c = ps_load((unsigned long *)&pw->worker_bmp); + /* if already went through this, should not have called start! */ + assert(!(c & b)); - a = sl_thd_aepinfo(t); - a->fn = (cos_aepthd_fn_t)fn; - a->data = data; + /* + * this thd, add to worker bmp to indicate it reached the construct. + * if this is the first to reach, then return "true", else "false". + * + * if cas failed, try again as you have to indicate that this thd + * has done this construct! + */ + if (ps_cas((unsigned long *)&pw->worker_bmp, c, c | b)) { + t->ws_off[coff] = i; - _THD_LOCAL_ACTIVATE(t); + return c ? false : true; + } + goto retry_bmp; } - sl_thd_yield(0); + assert(0); /* exceed the number of workshares? */ - fn(data); - ps_faa((unsigned long *)&num_done, 1); - /* TODO: anything else to do in this master? thread */ + return false; +} - while (ps_load((unsigned long *)&num_done) < (unsigned long)num_threads) sl_thd_yield(0); +void +GOMP_barrier (void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + + part_task_barrier(t); +} + +static inline bool +_gomp_loop_dynamic_next(struct part_task *t, struct part_workshare *w, long *s, long *e) +{ + long cn, left, wrk = 0; + +retry: + cn = ps_load((unsigned long *)&w->next); + left = w->end - cn; + + if (left == 0) return false; + /* todo: incr <= 0 */ + assert(w->inc > 0); + + wrk = w->chunk_sz; + if (left < wrk) wrk = left; + if (!ps_cas((unsigned long *)&w->next, cn, cn + wrk)) goto retry; + + *s = cn; + *e = cn + wrk; + + return true; } bool -GOMP_single_start (void) +GOMP_loop_dynamic_start (long start, long end, long incr, long chunk_size, + long *istart, long *iend) { - static thdid_t t = 0; + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + int i; + int coff = part_task_work_thd_num(t); + unsigned b = 1 << coff; + + assert(coff >= 0 && coff < (int)t->nthds); + for (i = t->ws_off[coff] + 1; i < PART_MAX_WORKSHARES; i++) { + struct part_workshare *pw = &t->ws[i]; + unsigned c; + + if (ps_load((unsigned long *)&pw->type) == PART_WORKSHARE_NONE) { + /* perhaps one of the threads just converted it to a loop */ + if (!ps_cas((unsigned long *)&pw->type, PART_WORKSHARE_NONE, PART_WORKSHARE_LOOP_DYNAMIC)) assert(pw->type == PART_WORKSHARE_LOOP_DYNAMIC); + } + + if (ps_load((unsigned long *)&pw->type) != PART_WORKSHARE_LOOP_DYNAMIC) continue; - /* TODO: intelligence! 
*/ - if (ps_cas((unsigned long *)&t, 0, cos_thdid())) return true; - if (t == cos_thdid()) return true; +retry_bmp: + c = ps_load((unsigned long *)&pw->worker_bmp); + /* if already went through this, should not have called start! */ + assert(!(c & b)); + + /* + * this thd, add to worker bmp to indicate it reached the construct. + */ + if (ps_cas((unsigned long *)&pw->worker_bmp, c, c | b)) t->ws_off[coff] = i; + else goto retry_bmp; + + /* all threads participating will initialize to the same values */ + if (!pw->end) { + pw->chunk_sz = chunk_size; + pw->inc = incr; + pw->st = start; + pw->end = end; + } + + if (istart && iend) return _gomp_loop_dynamic_next(t, pw, istart, iend); + else return true; + } + + assert(0); return false; } void -GOMP_barrier (void) +GOMP_parallel_loop_dynamic (void (*fn) (void *), void *data, + unsigned num_threads, long start, long end, + long incr, long chunk_size, unsigned flags) +{ + struct part_task pt; + bool ret; + + _gomp_parallel_start(&pt, fn, data, num_threads, flags); + ret = GOMP_loop_dynamic_start(start, end, incr, chunk_size, NULL, NULL); + assert(ret == true); + + fn(data); + _gomp_parallel_end(&pt); +} + +bool +GOMP_loop_dynamic_next (long *istart, long *iend) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + unsigned coff = part_task_work_thd_num(t); + int woff = t->ws_off[coff]; + + woff = woff < 0 ? 0 : woff; + t->ws_off[coff] = woff; + assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); + + return _gomp_loop_dynamic_next(t, &t->ws[woff], istart, iend); +} + +void +GOMP_loop_end (void) { - /* TODO: intelligence to wait for all threads in the team! */ - sl_thd_yield(0); + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + unsigned coff = part_task_work_thd_num(t); + int woff = t->ws_off[coff], c = 0; + + assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); + + part_task_barrier(t); + +// do { +// c = ps_load((unsigned long *)&t->nwsdone); +// } while (!ps_cas((unsigned long *)&t->nwsdone, c, c | (1 << woff))); +} + +void +GOMP_loop_end_nowait (void) +{ + struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; + unsigned coff = part_task_work_thd_num(t); + int woff = t->ws_off[coff], c = 0; + + assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); +// do { +// c = ps_load((unsigned long *)&t->nwsdone); +// } while (!ps_cas((unsigned long *)&t->nwsdone, c, c | (1 << woff))); } void GOMP_critical_start (void) { - /* TODO: a multi-core lock! */ - sl_lock_take(&_cos_gomp_lock); +// /* TODO: a multi-core lock! */ +// sl_lock_take(&_cos_gomp_lock); } void GOMP_critical_end (void) { - /* TODO: a multi-core lock! */ - sl_lock_release(&_cos_gomp_lock); +// /* TODO: a multi-core lock! 
*/ +// sl_lock_release(&_cos_gomp_lock); } diff --git a/src/components/lib/cos_gomp/cos_gomp.h b/src/components/lib/cos_gomp/cos_gomp.h new file mode 100644 index 0000000000..9d11a85df0 --- /dev/null +++ b/src/components/lib/cos_gomp/cos_gomp.h @@ -0,0 +1,11 @@ +#ifndef COS_GOMP_H +#define COS_GOMP_H + +#include + +#define COS_GOMP_MAX_THDS PART_MAX_THDS +#define COS_GOMP_CORE_MAX_THDS PART_MAX_CORE_THDS +#define COS_GOMP_MAX_CHILD PART_MAX_CHILD +#define COS_GOMP_MAX_TASKS 4096 + +#endif /* COS_GOMP_H */ diff --git a/src/components/lib/cos_omp.c b/src/components/lib/cos_gomp/cos_omp.c similarity index 89% rename from src/components/lib/cos_omp.c rename to src/components/lib/cos_gomp/cos_omp.c index f1c7bea1bb..f271311648 100644 --- a/src/components/lib/cos_omp.c +++ b/src/components/lib/cos_gomp/cos_omp.c @@ -5,6 +5,7 @@ * Author: Phani Gadepalli, phanikishoreg@gwu.edu */ +#include #include #include #include @@ -45,23 +46,23 @@ omp_get_max_threads(void) __GOMP_NOTHROW int omp_get_num_threads(void) { - /* FIXME: number of threads in the current team! */ - return omp_get_max_threads(); + struct sl_thd *t = sl_thd_curr(); + struct part_task *pt = (struct part_task *)t->part_context; + + if (pt) return pt->nthds; + + return 1; } __GOMP_NOTHROW int omp_get_thread_num(void) { - /* - * thread number within a team of a parallel construct! - * master thd will be = 0 - * not the physical thread id. - * - * TODO: fetch from team structure? - * - * For now though, a big hack! - */ - return (cos_thdid() % omp_get_max_threads()); + struct sl_thd *t = sl_thd_curr(); + struct part_task *pt = (struct part_task *)t->part_context; + + if (!pt) return 0; + + return part_task_work_thd_num(pt); } static inline void diff --git a/src/components/lib/part.c b/src/components/lib/part.c new file mode 100644 index 0000000000..59e0add4f6 --- /dev/null +++ b/src/components/lib/part.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include + +#include +#include + +struct deque_part part_dq_percore[NUM_CPU]; +//struct cirque_par parcq_global; +struct ps_list_head part_l_global; +static unsigned part_ready = 0; + +#define _PART_PRIO 1 +#define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) + +unsigned +part_isready(void) +{ return part_ready; } + +void +part_init(void) +{ + int j; + struct sl_xcore_thd *x; + sched_param_t p = _PART_PRIO_PACK(); + sched_param_t pa[1] = { p }; + struct sl_thd *t; + static int is_first = NUM_CPU; + + ps_list_head_init(&part_l_global); + if (!ps_cas((unsigned long *)&is_first, NUM_CPU, cos_cpuid())) return; + + for (j = 0; j < NUM_CPU; j++) { + int k; + + if (j == cos_cpuid()) { + for (k = 0; k < PART_MAX_CORE_THDS; k++) { + t = sl_thd_alloc(part_thd_fn, NULL); + assert(t); + + sl_thd_param_set(t, p); + + x = sl_xcore_thd_lookup(sl_thd_thdid(t), cos_cpuid()); + assert(x); + } + + } else { + for (k = 0; k < PART_MAX_CORE_THDS; k++) { + x = sl_xcore_thd_alloc(j, part_thd_fn, NULL, 1, pa); + assert(x); + } + } + } + + part_ready = 1; +} diff --git a/src/components/lib/sl/sl_mod_fifo.c b/src/components/lib/sl/sl_mod_fifo.c index 4f6618f1f0..83b975f883 100644 --- a/src/components/lib/sl/sl_mod_fifo.c +++ b/src/components/lib/sl/sl_mod_fifo.c @@ -48,7 +48,6 @@ sl_mod_wakeup(struct sl_thd_policy *t) void sl_mod_yield(struct sl_thd_policy *t, struct sl_thd_policy *yield_to) { - /* should yield move the current thread to end of the runQ? don't think so! FIFO scheduler, so yield doesn't change the sched order! 
*/ ps_list_rem_d(t); ps_list_head_append_d(&threads[cos_cpuid()], t); } diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c index 10e149d868..283cf9fb79 100644 --- a/src/components/lib/sl/sl_xcore.c +++ b/src/components/lib/sl/sl_xcore.c @@ -7,16 +7,12 @@ /******************************* Client-side ***************************/ /* static xcore thread backend! mainly for bookkeeping across cores! */ -struct _sl_xcore_thds { - struct sl_xcore_thd _thds[MAX_NUM_THREADS]; -} CACHE_ALIGNED; - -static struct _sl_xcore_thds _xcore_thds[NUM_CPU]; +static struct sl_xcore_thd _xcore_thds[MAX_NUM_THREADS]; static inline struct sl_xcore_thd * _sl_xcore_thd_backend_lookup(thdid_t tid) { - return &(_xcore_thds[cos_cpuid()]._thds[tid]); + return &_xcore_thds[tid]; } static inline struct sl_xcore_thd * @@ -24,14 +20,9 @@ _sl_xcore_thd_backend_init(thdid_t tid, cpuid_t core, asndcap_t snd) { struct sl_xcore_thd *t = _sl_xcore_thd_backend_lookup(tid); - sl_cs_enter(); - if (unlikely(t->thd)) goto done; + if (unlikely(t->thd)) return t; t->thd = tid; t->core = core; - t->asnd = snd; - -done: - sl_cs_exit(); return t; } From 98169a71b0fc07fd02a58caa7cb479b21317fc5d Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 9 May 2019 17:10:33 -0400 Subject: [PATCH 058/127] FPRR: bitmap for priorities --- .../tests/unit_fprr/unit_fprr.c | 14 ++-- src/components/include/sl.h | 55 +++++++-------- src/components/lib/sl/sl_mod_fifo.c | 3 +- src/components/lib/sl/sl_mod_fprr.c | 68 +++++++++++++------ src/components/lib/sl/sl_mod_rr.c | 3 +- src/components/lib/sl/sl_sched.c | 2 + 6 files changed, 85 insertions(+), 60 deletions(-) diff --git a/src/components/implementation/tests/unit_fprr/unit_fprr.c b/src/components/implementation/tests/unit_fprr/unit_fprr.c index 9a72960420..093ba6c25b 100644 --- a/src/components/implementation/tests/unit_fprr/unit_fprr.c +++ b/src/components/implementation/tests/unit_fprr/unit_fprr.c @@ -161,12 +161,12 @@ run_xcore_tests() static void run_tests() { -// test_highest_is_scheduled(); -// PRINTC("%s: Schedule highest priority thread only!\n", high_thd_test_status[cos_cpuid()] ? "FAILURE" : "SUCCESS"); -// test_swapping(); -// PRINTC("%s: Swap back and forth!\n", (thd1_ran[cos_cpuid()] && thd2_ran[cos_cpuid()]) ? "SUCCESS" : "FAILURE"); + test_highest_is_scheduled(); + PRINTC("%s: Schedule highest priority thread only!\n", high_thd_test_status[cos_cpuid()] ? "FAILURE" : "SUCCESS"); + test_swapping(); + PRINTC("%s: Swap back and forth!\n", (thd1_ran[cos_cpuid()] && thd2_ran[cos_cpuid()]) ? 
"SUCCESS" : "FAILURE"); - run_xcore_tests(); +// run_xcore_tests(); PRINTC("Unit-test done!\n"); sl_thd_exit(); @@ -181,8 +181,6 @@ cos_init(void) struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); - PRINTC("Unit-test for the scheduling library (sl)\n"); - if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_llinit(); @@ -196,6 +194,8 @@ cos_init(void) while (!ps_load(&init_done[i])) ; } + PRINTC("Unit-test for the scheduling library (sl)\n"); + sl_init(SL_MIN_PERIOD_US); testing_thread = sl_thd_alloc(run_tests, NULL); diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 056cf8ba6b..f3ed3cb440 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -40,8 +40,9 @@ #include #include -#undef SL_TIMEOUTS +#undef SL_TIMEOUTS #define SL_CS +#undef SL_REPLENISH /* Critical section (cs) API to protect scheduler data-structures */ struct sl_cs { @@ -193,7 +194,6 @@ sl_cs_enter_nospin(void) cached.v = csi.v; if (unlikely(csi.s.owner)) { - assert(0); return sl_cs_enter_contention(&csi, &cached, gcore, t, cos_sched_sync()); } @@ -244,7 +244,6 @@ sl_cs_exit(void) cached.v = csi.v; if (unlikely(csi.s.contention)) { - assert(0); if (sl_cs_exit_contention(&csi, &cached, gcore, cos_sched_sync())) goto retry; return; @@ -490,12 +489,11 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) static inline int sl_thd_activate(struct sl_thd *t, sched_tok_t tok) { -// struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); -// struct cos_compinfo *ci = &dci->ci; -// struct sl_global_core *g = sl__globals_core(); -// int ret = 0; + struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *ci = &dci->ci; + struct sl_global_core *g = sl__globals_core(); + int ret = 0; -#if 0 if (t->properties & SL_THD_PROPERTY_SEND) { return cos_sched_asnd(t->sndcap, g->timeout_next, g->sched_rcv, tok); } else if (t->properties & SL_THD_PROPERTY_OWN_TCAP) { @@ -511,15 +509,9 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) * Attempting to activate scheduler thread or idle thread failed for no budget in it's tcap. * Force switch to the scheduler with current tcap. */ - return cos_switch(sl_thd_thdcap(g->sched_thd), 0, t->prio, 0, g->sched_rcv, tok); -#endif - /* TODO: can't use if you're reprogramming a timer/prio */ - return sl_thd_dispatch(t, tok, sl_thd_curr()); - //return cos_switch(sl_thd_thdcap(t), g->sched_tcap, t->prio, - // g->timeout_next, g->sched_rcv, tok); -#if 0 + return cos_switch(sl_thd_thdcap(t), g->sched_tcap, t->prio, + g->timeout_next, g->sched_rcv, tok); } -#endif } static inline int @@ -563,14 +555,14 @@ sl_cs_exit_schedule_nospin_arg_c(struct sl_thd *curr, struct sl_thd *next) static inline int sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) { -// return sl_thd_dispatch(to, cos_sched_sync(), sl_thd_curr()); -#if 1 struct sl_thd *t = to; -// struct sl_global_core *globals = sl__globals_core(); + struct sl_global_core *globals = sl__globals_core(); sched_tok_t tok; -// cycles_t now; -// s64_t offset; -// int ret; +#if defined(SL_TIMEOUTS) || defined(SL_REPLENISH) + cycles_t now; +#endif + s64_t offset; + int ret; /* Don't abuse this, it is only to enable the tight loop around this function for races... 
*/ #ifdef SL_CS @@ -578,7 +570,9 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) #endif tok = cos_sched_sync(); -// now = sl_now(); +#if defined(SL_TIMEOUTS) || defined(SL_REPLENISH) + now = sl_now(); +#endif #ifdef SL_TIMEOUTS offset = (s64_t)(globals->timer_next - now); @@ -606,7 +600,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) t = sl_mod_thd_get(pt); } -#if 0 +#ifdef SL_REPLENISH if (t->properties & SL_THD_PROPERTY_OWN_TCAP && t->budget) { struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); @@ -640,9 +634,13 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) if (t == sl__globals_core()->idle_thd) t = sl__globals_core()->sched_thd; if (t == sl_thd_curr()) return 0; - return sl_thd_dispatch(t, tok, sl_thd_curr()); -// ret = sl_thd_activate(t, tok); -#if 0 +#ifdef SL_TIMEOUTS + ret = sl_thd_activate(t, tok); +#else + ret = sl_thd_dispatch(t, tok, sl_thd_curr()); +#endif + +#ifdef SL_REPLENISH /* * dispatch failed with -EPERM because tcap associated with thread t does not have budget. * Block the thread until it's next replenishment and return to the scheduler thread. @@ -657,8 +655,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) } #endif -// return ret; -#endif + return ret; } static inline int diff --git a/src/components/lib/sl/sl_mod_fifo.c b/src/components/lib/sl/sl_mod_fifo.c index 83b975f883..b4c7d5cab1 100644 --- a/src/components/lib/sl/sl_mod_fifo.c +++ b/src/components/lib/sl/sl_mod_fifo.c @@ -12,9 +12,8 @@ #define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US -struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; +static struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; -/* No RR yet */ void sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) { } diff --git a/src/components/lib/sl/sl_mod_fprr.c b/src/components/lib/sl/sl_mod_fprr.c index 5d1c5dd202..31d74b0566 100644 --- a/src/components/lib/sl/sl_mod_fprr.c +++ b/src/components/lib/sl/sl_mod_fprr.c @@ -9,9 +9,9 @@ #define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US -struct ps_list_head threads[NUM_CPU][SL_FPRR_NPRIOS] CACHE_ALIGNED; +static unsigned int thdlist_bmp[NUM_CPU] CACHE_ALIGNED; +static struct ps_list_head threads[NUM_CPU][SL_FPRR_NPRIOS] CACHE_ALIGNED; -/* No RR yet */ void sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) { } @@ -20,37 +20,59 @@ struct sl_thd_policy * sl_mod_schedule(void) { int i; - struct sl_thd_policy *t; + struct sl_thd_policy *t = NULL; - for (i = 0 ; i < SL_FPRR_NPRIOS ; i++) { - if (ps_list_head_empty(&threads[cos_cpuid()][i])) continue; - t = ps_list_head_first_d(&threads[cos_cpuid()][i], struct sl_thd_policy); + if (unlikely(!thdlist_bmp[cos_cpuid()])) return NULL; + i = __builtin_ctz(thdlist_bmp[cos_cpuid()]); + assert(i < SL_FPRR_NPRIOS); + assert(!ps_list_head_empty(&threads[cos_cpuid()][i])); + t = ps_list_head_first_d(&threads[cos_cpuid()][i], struct sl_thd_policy); + assert(t); - /* - * We want to move the selected thread to the back of the list. 
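 * [Editor's note on the bitmap-based scheduling added just above, not part of
 *  the original patch: bit p of thdlist_bmp marks a non-empty run queue for
 *  priority p + 1, and __builtin_ctz() returns the lowest set bit, i.e. the
 *  highest-priority (numerically smallest) queue with a runnable thread, in
 *  O(1).  For example a bitmap of 0x24 (bits 2 and 5 set) gives ctz = 2 and
 *  selects the priority-3 queue.]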
- * Otherwise fprr won't be truly round robin - */ - ps_list_rem_d(t); - ps_list_head_append_d(&threads[cos_cpuid()][i], t); + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()][i], t); - return t; - } + return t; +} - return NULL; +static inline void +__sl_mod_bmp_unset(struct sl_thd_policy *t) +{ + unsigned int ctb = ps_load(&thdlist_bmp[cos_cpuid()]); + unsigned int p = t->priority - 1, b = 1 << p; + + if (!ps_list_head_empty(&threads[cos_cpuid()][p])) return; + + /* unset from bitmap if there are no threads at this priority */ + if (unlikely(!ps_upcas(&thdlist_bmp[cos_cpuid()], ctb, ctb & ~b))) assert(0); +} + +static inline void +__sl_mod_bmp_set(struct sl_thd_policy *t) +{ + unsigned int ctb = ps_load(&thdlist_bmp[cos_cpuid()]); + unsigned int p = t->priority - 1, b = 1 << p; + + if (unlikely(ctb & b)) return; + + assert(!ps_list_head_empty(&threads[cos_cpuid()][p])); + /* set to bitmap if this is the first element added at this prio! */ + if (unlikely(!ps_upcas(&thdlist_bmp[cos_cpuid()], ctb, ctb | b))) assert(0); } void sl_mod_block(struct sl_thd_policy *t) { ps_list_rem_d(t); + __sl_mod_bmp_unset(t); } void sl_mod_wakeup(struct sl_thd_policy *t) { assert(ps_list_singleton_d(t)); - ps_list_head_append_d(&threads[cos_cpuid()][t->priority - 1], t); + __sl_mod_bmp_set(t); } void @@ -72,7 +94,10 @@ sl_mod_thd_create(struct sl_thd_policy *t) void sl_mod_thd_delete(struct sl_thd_policy *t) -{ ps_list_rem_d(t); } +{ + ps_list_rem_d(t); + __sl_mod_bmp_unset(t); +} void sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) @@ -81,10 +106,12 @@ sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned case SCHEDP_PRIO: { assert(v >= SL_FPRR_PRIO_HIGHEST && v <= SL_FPRR_PRIO_LOWEST); - ps_list_rem_d(t); /* if we're already on a list, and we're updating priority */ + /* should not have been on any prio before, this is FP */ + assert(ps_list_singleton_d(t)); t->priority = v; - ps_list_head_append_d(&threads[cos_cpuid()][t->priority - 1], t); - sl_thd_setprio(sl_mod_thd_get(t), t->priority); + ps_list_head_append_d(&threads[cos_cpuid()][v - 1], t); + __sl_mod_bmp_set(t); + sl_thd_setprio(sl_mod_thd_get(t), v); break; } @@ -110,6 +137,7 @@ sl_mod_init(void) { int i; + thdlist_bmp[cos_cpuid()] = 0; memset(threads[cos_cpuid()], 0, sizeof(struct ps_list_head) * SL_FPRR_NPRIOS); for (i = 0 ; i < SL_FPRR_NPRIOS ; i++) { ps_list_head_init(&threads[cos_cpuid()][i]); diff --git a/src/components/lib/sl/sl_mod_rr.c b/src/components/lib/sl/sl_mod_rr.c index 3db300a735..d02bf502dd 100644 --- a/src/components/lib/sl/sl_mod_rr.c +++ b/src/components/lib/sl/sl_mod_rr.c @@ -5,9 +5,8 @@ #define SL_FPRR_PERIOD_US_MIN SL_MIN_PERIOD_US -struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; +static struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; -/* No RR yet */ void sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) { } diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 16def138f2..7502e41c64 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -531,6 +531,7 @@ sl_thd_param_set(struct sl_thd *t, sched_param_t sp) assert(t); + sl_cs_enter(); sched_param_get(sp, &type, &value); switch (type) { @@ -549,6 +550,7 @@ sl_thd_param_set(struct sl_thd *t, sched_param_t sp) } sl_mod_thd_param_set(sl_mod_thd_policy_get(t), type, value); + sl_cs_exit(); } void From a2b6e580664c1cdc1a4817c44b596ffe280f5a6d Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 10 May 2019 18:38:35 -0400 
Subject: [PATCH 059/127] crt_lock, stack-list multi-core and bugfixes in sl/capmgr/part/gomp --- .../implementation/capmgr/naive/init.c | 1 + .../no_interface/omp_dijkstra/Makefile | 2 +- .../no_interface/omp_dijkstra/dijkstra_omp.c | 8 +- .../no_interface/omp_hello/Makefile | 2 +- .../no_interface/omp_hello/hello_omp.c | 11 +- src/components/include/crt_lock.h | 4 +- src/components/include/part.h | 82 ++++++-- src/components/include/part_task.h | 37 ++-- src/components/include/sl.h | 4 + src/components/include/sl_xcore.h | 3 +- src/components/include/stacklist.h | 11 +- src/components/lib/cos_gomp/cos_gomp.c | 185 +++++++++++++----- src/components/lib/part.c | 56 +++--- src/components/lib/sl/sl_blkpt.c | 27 ++- src/components/lib/sl/sl_capmgr.c | 1 + src/components/lib/sl/sl_sched.c | 13 +- src/components/lib/sl/sl_xcore.c | 20 +- src/kernel/include/shared/cos_config.h | 2 +- 18 files changed, 326 insertions(+), 143 deletions(-) diff --git a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index fc8087c9c9..35f5b0edd1 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -96,6 +96,7 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) ret = hypercall_root_initaep_set(spdid, sl_thd_aepinfo(ithd)); assert(ret == 0); cap_info_initthd_init(rci, ithd, 0); + cap_comminfo_init(ithd, 0, 0); } return; diff --git a/src/components/implementation/no_interface/omp_dijkstra/Makefile b/src/components/implementation/no_interface/omp_dijkstra/Makefile index 1a7d4146ce..c018ed38c1 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/Makefile +++ b/src/components/implementation/no_interface/omp_dijkstra/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_dijkstra.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c index da0fb3c4e9..34a648ef95 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c +++ b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c @@ -3,6 +3,7 @@ #include #include #include +#include # define NV 6 @@ -580,8 +581,8 @@ cos_init(void *d) int i; static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + PRINTC("In OpenMP-based Hello Program!\n"); if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { - PRINTC("In OpenMP-based Hello Program!\n"); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init(); } else { @@ -589,18 +590,19 @@ cos_init(void *d) cos_defcompinfo_sched_init(); } - sl_init(SL_MIN_PERIOD_US*100); ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
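 * [Editor's note, not part of the original patch: the ps_cas() on `first`
 *  above elects exactly one core to do the one-time global set-up, each core
 *  then publishes init_done[cos_cpuid()], and the loop below spins until
 *  every core has checked in.  This patch also moves sl_init() and
 *  cos_gomp_init() to after that barrier so each core initializes its
 *  scheduler and the GOMP runtime itself (previously cos_gomp_init() ran on
 *  core 0 only).]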
*/ for (i = 0; i < NUM_CPU; i++) { while (!ps_load((unsigned long *)&init_done[i])) ; } + sl_init(SL_MIN_PERIOD_US*100); + cos_gomp_init(); + hypercall_comp_init_done(); if (!cos_cpuid()) { struct sl_thd *t = NULL; - cos_gomp_init(); t = sl_thd_alloc(cos_main, NULL); assert(t); sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); diff --git a/src/components/implementation/no_interface/omp_hello/Makefile b/src/components/implementation/no_interface/omp_hello/Makefile index 9ecad31df4..ac2bc81844 100644 --- a/src/components/implementation/no_interface/omp_hello/Makefile +++ b/src/components/implementation/no_interface/omp_hello/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_hello.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_hello/hello_omp.c b/src/components/implementation/no_interface/omp_hello/hello_omp.c index e9c0932b5c..f2d2495aa9 100644 --- a/src/components/implementation/no_interface/omp_hello/hello_omp.c +++ b/src/components/implementation/no_interface/omp_hello/hello_omp.c @@ -3,6 +3,7 @@ #include #include #include +#include /******************************************************************************/ @@ -60,7 +61,7 @@ int main ( void ) #if 1 #pragma omp parallel { -#pragma omp for schedule(dynamic) +#pragma omp for for (id = 0; id < 10; id++) { PRINTC("id:%u\n", id); } @@ -111,8 +112,8 @@ cos_init(void *d) int i; static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + PRINTC("In OpenMP-based Hello Program!\n"); if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { - PRINTC("In OpenMP-based Hello Program!\n"); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init(); } else { @@ -120,19 +121,19 @@ cos_init(void *d) cos_defcompinfo_sched_init(); } - sl_init(SL_MIN_PERIOD_US*100); ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ for (i = 0; i < NUM_CPU; i++) { while (!ps_load((unsigned long *)&init_done[i])) ; } + sl_init(SL_MIN_PERIOD_US*100); + cos_gomp_init(); + hypercall_comp_init_done(); if (!cos_cpuid()) { struct sl_thd *t = NULL; - cos_gomp_init(); - t = sl_thd_alloc(cos_main, NULL); assert(t); sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); diff --git a/src/components/include/crt_lock.h b/src/components/include/crt_lock.h index c393c53648..95e901e52b 100644 --- a/src/components/include/crt_lock.h +++ b/src/components/include/crt_lock.h @@ -39,7 +39,7 @@ crt_lock_take(struct crt_lock *l) while (1) { crt_blkpt_checkpoint(&l->blkpt, &chkpt); - if (ps_cas(&l->owner, 0, (unsigned long)cos_thdid())) { + if (ps_cas(&l->owner, 0, (unsigned long)(cos_cpuid() << 16 | cos_thdid()))) { return; /* success! */ } /* failure: try and block */ @@ -50,7 +50,7 @@ crt_lock_take(struct crt_lock *l) static inline void crt_lock_release(struct crt_lock *l) { - assert(l->owner == cos_thdid()); + assert(l->owner == (unsigned long)(cos_cpuid() << 16 | cos_thdid())); l->owner = 0; /* if there are blocked threads, wake 'em up! 
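 * [Editor's note, not part of the original patch: the owner field now packs
 *  both identifiers as (cpuid << 16) | thdid, so for example thread 5 on
 *  core 0 stores 0x00005 while thread 5 on core 1 stores 0x10005; equal
 *  thread IDs on different cores can no longer be confused, and the assert
 *  in crt_lock_release() above checks the full pair before clearing it.]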
*/ crt_blkpt_trigger(&l->blkpt, 0); diff --git a/src/components/include/part.h b/src/components/include/part.h index 57a505909c..1cf459e997 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -4,8 +4,11 @@ #include #include #include +#include #include + +#define PART_NESTED 0 /* 0 - disabled, 1 - enabled */ //#include DEQUE_PROTOTYPE(part, struct part_task *); @@ -13,7 +16,9 @@ DEQUE_PROTOTYPE(part, struct part_task *); extern struct deque_part part_dq_percore[]; //extern struct cirque_par parcq_global; +/* FIXME: use stacklist or another stack like data structure? */ extern struct ps_list_head part_l_global; +extern struct crt_lock part_l_lock; static inline struct deque_part * part_deque_curr(void) @@ -86,10 +91,10 @@ part_deque_steal_any(void) struct part_task *t = NULL; i ++; - if (c == (unsigned)cos_cpuid()) c = (c + 1) % NUM_CPU; + if (unlikely(c == (unsigned)cos_cpuid())) c = (c + 1) % NUM_CPU; t = part_deque_steal(c); - if (t) return t; + if (likely(t)) return t; } while (i < NUM_CPU); return NULL; @@ -121,7 +126,9 @@ part_list_append(struct part_task *t) assert(ps_list_singleton(t, partask)); assert(t->type == PART_TASK_T_WORKSHARE); + crt_lock_take(&part_l_lock); ps_list_head_append(part_list(), t, partask); + crt_lock_release(&part_l_lock); } static inline void @@ -130,44 +137,89 @@ part_list_remove(struct part_task *t) assert(t->type == PART_TASK_T_WORKSHARE); assert(!ps_list_singleton(t, partask)); + crt_lock_take(&part_l_lock); ps_list_rem(t, partask); + crt_lock_release(&part_l_lock); } static inline struct part_task * part_list_peek(void) { struct part_task *t = NULL; + int found = 0; - if (ps_list_head_empty(part_list())) return NULL; + crt_lock_take(&part_l_lock); + if (unlikely(ps_list_head_empty(part_list()))) goto done; /* not great! traversing from the first element always! */ /* TODO: perhaps traverse from the current task? */ ps_list_foreach(part_list(), t, partask) { int i; assert(t); - assert(t->type == PART_TASK_T_WORKSHARE); /* coz, master thread adds to list the implicit task and doesn't defer it */ i = part_task_work_try(t); assert(i != 0); - if (i > 0) return t; + if (likely(i > 0 && !ps_load(&t->end))) { + found = 1; + break; + } } - return NULL; +done: + crt_lock_release(&part_l_lock); + + if (unlikely(!found)) return NULL; + + return t; } void part_init(void); unsigned part_isready(void); +/* a part_task.h api but uses part_list_remove in the master thread, so here! */ +static inline void +part_task_end(struct part_task *t) +{ + struct sl_thd *ts = sl_thd_curr(); + int tn = part_task_work_thd_num(t); + + part_task_barrier(t); + + assert(tn >= 0 && t->nthds >= 1); + assert(ts->part_context == (void *)t); + if (t->nthds == 1) { + assert(tn == 0); + + return; + } + + if (tn == 0) { + if (t->type == PART_TASK_T_WORKSHARE) part_list_remove(t); + ts->part_context = t->parent; + part_task_remove_child(t->parent, t); + ps_faa(&t->end, 1); + } else { + ps_faa(&t->end, 1); + while (ps_load(&t->end) != t->nthds) sl_thd_yield(0); + + ts->part_context = NULL; + } +} + + static inline void part_thd_fn(void *d) { struct sl_thd *curr = sl_thd_curr(); - while (!part_isready()) sl_thd_yield(0); - while (ps_list_head_empty(part_list())) sl_thd_yield(0); + /* parallel runtime not ready? */ + while (unlikely(!part_isready())) sl_thd_yield(0); + + /* no parallel sections? 
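 * [Editor's note, not part of the original patch: the worker loop that
 *  follows looks for work in three places, in order: an open workshare on
 *  the global list (part_list_peek()), an explicit task popped from this
 *  core's deque (part_deque_pop()), and a task stolen from another core's
 *  deque (part_deque_steal_any()), yielding when nothing is runnable.  Once
 *  a task is found the thread claims a slot in it with part_task_work_try(),
 *  runs t->cs.fn(t->cs.data), and retires it with part_task_end().]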
*/ + while (unlikely(ps_list_head_empty(part_list()))) sl_thd_yield(0); while (1) { struct part_task *t = NULL; @@ -177,35 +229,33 @@ part_thd_fn(void *d) /* FIXME: nested parallel needs love! */ t = part_list_peek(); - if (t) goto found; + if (likely(t)) goto found; single: ret = part_deque_pop(t); - if (ret == 0) { + if (likely(ret == 0)) { assert(t->type != PART_TASK_T_WORKSHARE); goto found; } - if (ret == -EAGAIN) goto single; + if (unlikely(ret == -EAGAIN)) goto single; t = part_deque_steal_any(); - if (!t) { + if (unlikely(!t)) { sl_thd_yield(0); continue; } assert(t->type != PART_TASK_T_WORKSHARE); found: thdnum = part_task_work_try(t); - if (thdnum < 0) continue; + if (unlikely(thdnum < 0)) continue; if (t->type != PART_TASK_T_WORKSHARE) assert(thdnum == 0); curr->part_context = (void *)t; t->cs.fn(t->cs.data); - if (t->type != PART_TASK_T_WORKSHARE) continue; - - part_task_barrier(t); + part_task_end(t); } sl_thd_exit(); diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index a88ce9b23f..3c6be3519e 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -13,7 +13,7 @@ #define PART_MAX_CORE_THDS 4 #define PART_MAX_THDS PART_MAX_CORE_THDS*NUM_CPU #define PART_MAX_CHILD PART_MAX -#define PART_MAX_WORKSHARES 8 +#define PART_MAX_WORKSHARES 16 typedef void (*part_fn_t)(void *); @@ -28,8 +28,7 @@ typedef enum { } part_task_state_t; typedef enum { - PART_TASK_T_NONE, - PART_TASK_T_WORKSHARE, /* task to put in a shared fifo queue */ + PART_TASK_T_WORKSHARE = 1, /* task to put in a shared fifo queue */ } part_task_type_t; typedef enum { @@ -69,9 +68,8 @@ struct part_task { unsigned nthds; /* number of threads for this task, 1 in case of non-workshare work */ unsigned workers[PART_MAX_THDS]; /* threads sharing this work or thread doing this work! */ int ws_off[PART_MAX_THDS]; /* progress of the workshares in each participating thread */ - //unsigned nwsdone; unsigned master; /* coreid << 16 | thdid of the master */ - unsigned barrier_in, barrier_out; + unsigned barrier_in, barrier_out, end; /* TODO: parent to wait on all child tasks for taskwait synchronization! 
*/ struct part_task *parent; @@ -108,8 +106,10 @@ part_task_add_child(struct part_task *t, struct part_task *c) { int i; + if (unlikely(!t || !c)) return -1; + for (i = 0; i < PART_MAX_CHILD; i++) { - if (t->child[i] == 0 && ps_cas((unsigned long *)&t->child[i], 0, (unsigned long)c)) return i; + if (likely(t->child[i] == 0 && ps_cas(&t->child[i], 0, (unsigned long)c))) return i; } return -1; @@ -120,34 +120,34 @@ part_task_remove_child(struct part_task *t, struct part_task *c) { int i; - if (!t || !c) return; + if (unlikely(!t || !c)) return; for (i = 0; i < PART_MAX_CHILD; i++) { if (t->child[i] != c) continue; - if (!ps_cas((unsigned long *)&t->child[i], (unsigned long)c, 0)) assert(0); + if (unlikely(!ps_cas(&t->child[i], (unsigned long)c, 0))) assert(0); } } static inline int part_task_work_try(struct part_task *t) { - int i = 0; + unsigned i = 0; unsigned key = PART_CURR_THD; if (t->type != PART_TASK_T_WORKSHARE) { assert(t->nthds == 1); } else { assert(t->master != key && t->master == t->workers[0]); - i = 1; + assert(t->nthds >= 1); } - for (; i < (int)t->nthds; i++) + for (; i < t->nthds; i++) { if (t->workers[i] == key) return i; if (t->workers[i]) continue; - if (ps_cas((unsigned long *)&t->workers[i], 0, key)) return i; + if (likely(ps_cas(&t->workers[i], 0, key))) return i; } return -1; @@ -172,10 +172,11 @@ part_task_work_thd_num(struct part_task *t) static inline void part_task_barrier(struct part_task *t) { + struct sl_thd *ts = sl_thd_curr(); int tn = part_task_work_thd_num(t); unsigned cin = 0, cout = 0; - assert(tn >= 0); + assert(tn >= 0 && t->nthds >= 1); if (t->nthds == 1) { assert(tn == 0 && t->barrier_in == 0); @@ -184,22 +185,22 @@ part_task_barrier(struct part_task *t) } /* wait for all siblings to have seen the previous barrier */ - while (ps_load((unsigned long *)&t->barrier_out) % t->nthds) sl_thd_yield(0); + while (ps_load(&t->barrier_out) % t->nthds) sl_thd_yield(0); - cin = ps_faa((unsigned long *)&t->barrier_in, 1); + cin = ps_faa(&t->barrier_in, 1); if (cin % t->nthds == t->nthds - 1) { int i; /* wait for all child tasks to complete, including explicit tasks */ for (i = 0; i < PART_MAX_CHILD; i++) { - while (ps_load((unsigned long *)&t->child[i])) sl_thd_yield(0); + while (ps_load(&t->child[i])) sl_thd_yield(0); } } else { /* wait for all sibling tasks to reach in barrier! 
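 * [Editor's note, not part of the original patch: part_task_barrier() is a
 *  two-counter barrier.  Each thread first waits for barrier_out to drain
 *  from the previous round (barrier_out % nthds == 0), then increments
 *  barrier_in; the last arriver additionally waits for every child[] task to
 *  finish, the others yield here until barrier_in reaches a multiple of
 *  nthds, and finally everyone increments barrier_out to open the next
 *  round.]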
*/ - while (ps_load((unsigned long *)&t->barrier_in) % t->nthds != 0) sl_thd_yield(0); + while (ps_load(&t->barrier_in) % t->nthds != 0) sl_thd_yield(0); } - ps_faa((unsigned long *)&t->barrier_out, 1); + ps_faa(&t->barrier_out, 1); } #endif /* PART_TASK_H */ diff --git a/src/components/include/sl.h b/src/components/include/sl.h index f3ed3cb440..16eef7e028 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -104,11 +104,15 @@ static inline struct sl_thd * sl_thd_lkup(thdid_t tid) { struct sl_thd *t; + struct sl_xcore_thd *xt; if (unlikely(tid < 1 || tid > MAX_NUM_THREADS)) return NULL; t = sl_mod_thd_get(sl_thd_lookup_backend(tid)); if (likely(t && sl_thd_aepinfo(t))) return t; + xt = sl_xcore_thd_lookup(tid); + if (unlikely(xt && xt->core != cos_cpuid())) return NULL; + /* FIXME: cross-core child threads must be handled in retrieve */ return sl_thd_retrieve_lazy(tid); } diff --git a/src/components/include/sl_xcore.h b/src/components/include/sl_xcore.h index 618246dd9d..8cf838142e 100644 --- a/src/components/include/sl_xcore.h +++ b/src/components/include/sl_xcore.h @@ -94,7 +94,8 @@ struct sl_xcore_thd { asndcap_t asnd[NUM_CPU]; } CACHE_ALIGNED; -struct sl_xcore_thd *sl_xcore_thd_lookup(thdid_t tid, cpuid_t core); +struct sl_xcore_thd *sl_xcore_thd_lookup(thdid_t tid); +struct sl_xcore_thd *sl_xcore_thd_lookup_init(thdid_t tid, cpuid_t core); static inline thdid_t sl_xcore_thd_thdid(struct sl_xcore_thd *t) { diff --git a/src/components/include/stacklist.h b/src/components/include/stacklist.h index 8651a408a7..f1ddd8078d 100644 --- a/src/components/include/stacklist.h +++ b/src/components/include/stacklist.h @@ -12,6 +12,7 @@ #include struct stacklist { + cpuid_t coreid; thdid_t thdid; struct stacklist *next; }; @@ -44,8 +45,9 @@ stacklist_rem(struct stacklist *l) static inline void stacklist_add(struct stacklist_head *h, struct stacklist *l) { - l->thdid = cos_thdid(); - l->next = NULL; + l->coreid = cos_cpuid(); + l->thdid = cos_thdid(); + l->next = NULL; assert(h); while (1) { @@ -57,8 +59,8 @@ stacklist_add(struct stacklist_head *h, struct stacklist *l) } /* Get a thread to wake up, and remove its record! 
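 * [Editor's note, not part of the original patch: stacklist_dequeue() below
 *  now also returns, through the new `core` out-parameter, the core the
 *  blocked thread registered from (recorded by stacklist_add() above).  That
 *  is what lets sched_blkpt_trigger() in sl_blkpt.c, later in this patch,
 *  wake local threads directly and hand cross-core wakeups to
 *  sl_xcore_thd_wakeup().]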
*/ -static inline thdid_t -stacklist_dequeue(struct stacklist_head *h) +static inline thdid_t +stacklist_dequeue(cpuid_t *core, struct stacklist_head *h) { struct stacklist *sl; @@ -76,6 +78,7 @@ stacklist_dequeue(struct stacklist_head *h) if (ps_cas((unsigned long *)&h->head, (unsigned long)sl, (unsigned long)sl->next)) break; } sl->next = NULL; + *core = sl->coreid; return sl->thdid; } diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index 78f3ea23af..90bda9d1a7 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -17,18 +17,26 @@ #include #include "cos_gomp.h" +#include #include #define COS_GOMP_MAX_EXPLICIT_TASKS 1024 #define COS_GOMP_MAX_IMPLICIT_TASKS 512 +#define COS_GOMP_MAX_ARGS 8 +#define COS_GOMP_MAX_ARG_SZ 64 +#define COS_GOMP_MAX_ARGS_SZ (COS_GOMP_MAX_ARGS * COS_GOMP_MAX_ARG_SZ) + static struct part_task _itasks[COS_GOMP_MAX_IMPLICIT_TASKS], _etasks[COS_GOMP_MAX_EXPLICIT_TASKS]; -static unsigned _itask_free, _etask_free; +static unsigned _itask_free, _etask_free, _etask_data_free; +static char _etask_data[COS_GOMP_MAX_EXPLICIT_TASKS][COS_GOMP_MAX_ARGS_SZ]; + +static struct crt_lock _glock; /* global lock for critical sections */ static inline struct part_task * _cos_gomp_alloc_implicit(void) { - unsigned i = ps_faa((unsigned long *)&_itask_free, 1); + unsigned i = ps_faa(&_itask_free, 1); assert(i < COS_GOMP_MAX_IMPLICIT_TASKS); return &_itasks[i]; @@ -37,20 +45,38 @@ _cos_gomp_alloc_implicit(void) static inline struct part_task * _cos_gomp_alloc_explicit(void) { - unsigned i = ps_faa((unsigned long *)&_etask_free, 1); + unsigned i = ps_faa(&_etask_free, 1); assert(i < COS_GOMP_MAX_EXPLICIT_TASKS); return &_etasks[i]; } +static inline char * +_cos_gomp_alloc_data_explicit(void) +{ + unsigned i = ps_faa(&_etask_data_free, 1); + + assert(i < COS_GOMP_MAX_EXPLICIT_TASKS); + return _etask_data[i]; +} + void cos_gomp_init(void) { - memset(_itasks, 0, sizeof(struct part_task) * COS_GOMP_MAX_IMPLICIT_TASKS); - memset(_etasks, 0, sizeof(struct part_task) * COS_GOMP_MAX_EXPLICIT_TASKS); - _itask_free = _etask_free = 0; - - cos_omp_init(); + static int first_one = NUM_CPU, init_done = 0; + + if (ps_cas(&first_one, NUM_CPU, cos_cpuid())) { + memset(_itasks, 0, sizeof(struct part_task) * COS_GOMP_MAX_IMPLICIT_TASKS); + memset(_etasks, 0, sizeof(struct part_task) * COS_GOMP_MAX_EXPLICIT_TASKS); + memset(_etask_data, 0, sizeof(char) * COS_GOMP_MAX_EXPLICIT_TASKS * COS_GOMP_MAX_ARGS_SZ); + _itask_free = _etask_free = _etask_data_free = 0; + + crt_lock_init(&_glock); + cos_omp_init(); + init_done = 1; + } else { + while(!ps_load(&init_done)) ; + } part_init(); } @@ -62,28 +88,25 @@ _gomp_parallel_start(struct part_task *pt, void (*fn) (void *), void *data, unsi struct part_task *parent = (struct part_task *)t->part_context; num_threads = num_threads ? ((num_threads > COS_GOMP_MAX_THDS) ? COS_GOMP_MAX_THDS : num_threads) : PART_MAX; + /* nesting? 
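 * [Editor's note, not part of the original patch: PART_NESTED is defined as 0
 *  in part.h in this same patch, so a parallel region encountered while one
 *  is already active (parent != NULL) is clamped to a single-thread team and
 *  its body runs serially on the encountering thread, matching disabled
 *  nested parallelism in OpenMP; PART_NESTED == 1 is meant to allow nested
 *  teams, which the FIXMEs elsewhere note is still unfinished.]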
*/ + if (unlikely(parent && PART_NESTED == 0)) num_threads = 1; + part_task_init(pt, PART_TASK_T_WORKSHARE, parent, num_threads, fn, data); - if (parent) { + assert(pt->nthds == num_threads); + if (unlikely(parent)) { parent_off = part_task_add_child(parent, pt); assert(parent_off >= 0); } t->part_context = pt; - if (num_threads > 1) part_list_append(pt); + if (unlikely(num_threads > 1)) part_list_append(pt); } static inline void _gomp_parallel_end(struct part_task *pt) { - struct sl_thd *t = sl_thd_curr(); - - /* implicit barrier */ - part_task_barrier(pt); - - if (pt->nthds > 1) part_list_remove(pt); - - t->part_context = pt->parent; - part_task_remove_child(pt->parent, pt); + /* implicit hard barrier. only master thread to deinit task and all other threads just go back to pool */ + part_task_end(pt); } /* GOMP_parallel prototype from libgomp within gcc */ @@ -111,14 +134,14 @@ GOMP_single_start(void) struct part_workshare *pw = &t->ws[i]; unsigned c; - if (ps_load((unsigned long *)&pw->type) == PART_WORKSHARE_NONE) { + if (ps_load(&pw->type) == PART_WORKSHARE_NONE) { /* perhaps one of the threads just converted it to a single */ - if (!ps_cas((unsigned long *)&pw->type, PART_WORKSHARE_NONE, PART_WORKSHARE_SINGLE)) assert(pw->type == PART_WORKSHARE_SINGLE); + if (!ps_cas(&pw->type, PART_WORKSHARE_NONE, PART_WORKSHARE_SINGLE)) assert(pw->type == PART_WORKSHARE_SINGLE); } - if (ps_load((unsigned long *)&pw->type) != PART_WORKSHARE_SINGLE) continue; + if (ps_load(&pw->type) != PART_WORKSHARE_SINGLE) continue; retry_bmp: - c = ps_load((unsigned long *)&pw->worker_bmp); + c = ps_load(&pw->worker_bmp); /* if already went through this, should not have called start! */ assert(!(c & b)); @@ -129,7 +152,7 @@ GOMP_single_start(void) * if cas failed, try again as you have to indicate that this thd * has done this construct! */ - if (ps_cas((unsigned long *)&pw->worker_bmp, c, c | b)) { + if (ps_cas(&pw->worker_bmp, c, c | b)) { t->ws_off[coff] = i; return c ? false : true; @@ -156,16 +179,16 @@ _gomp_loop_dynamic_next(struct part_task *t, struct part_workshare *w, long *s, long cn, left, wrk = 0; retry: - cn = ps_load((unsigned long *)&w->next); + cn = ps_load(&w->next); left = w->end - cn; - if (left == 0) return false; + if (unlikely(left == 0)) return false; /* todo: incr <= 0 */ assert(w->inc > 0); wrk = w->chunk_sz; - if (left < wrk) wrk = left; - if (!ps_cas((unsigned long *)&w->next, cn, cn + wrk)) goto retry; + if (unlikely(left < wrk)) wrk = left; + if (!ps_cas(&w->next, cn, cn + wrk)) goto retry; *s = cn; *e = cn + wrk; @@ -187,33 +210,33 @@ GOMP_loop_dynamic_start (long start, long end, long incr, long chunk_size, struct part_workshare *pw = &t->ws[i]; unsigned c; - if (ps_load((unsigned long *)&pw->type) == PART_WORKSHARE_NONE) { + if (ps_load(&pw->type) == PART_WORKSHARE_NONE) { /* perhaps one of the threads just converted it to a loop */ - if (!ps_cas((unsigned long *)&pw->type, PART_WORKSHARE_NONE, PART_WORKSHARE_LOOP_DYNAMIC)) assert(pw->type == PART_WORKSHARE_LOOP_DYNAMIC); + if (!ps_cas(&pw->type, PART_WORKSHARE_NONE, PART_WORKSHARE_LOOP_DYNAMIC)) assert(pw->type == PART_WORKSHARE_LOOP_DYNAMIC); } - if (ps_load((unsigned long *)&pw->type) != PART_WORKSHARE_LOOP_DYNAMIC) continue; + if (ps_load(&pw->type) != PART_WORKSHARE_LOOP_DYNAMIC) continue; retry_bmp: - c = ps_load((unsigned long *)&pw->worker_bmp); + c = ps_load(&pw->worker_bmp); /* if already went through this, should not have called start! 
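 * [Editor's note, not part of the original patch: `b` is this thread's bit in
 *  the workshare's worker_bmp (presumably 1 << its offset in the team, set
 *  shortly before this hunk), so the bitmap records which team members have
 *  already entered the construct; for example, after members 0 and 2 arrive
 *  it reads 0x5.  The CAS retry below publishes arrival without a lock.]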
*/ assert(!(c & b)); /* * this thd, add to worker bmp to indicate it reached the construct. */ - if (ps_cas((unsigned long *)&pw->worker_bmp, c, c | b)) t->ws_off[coff] = i; + if (ps_cas(&pw->worker_bmp, c, c | b)) t->ws_off[coff] = i; else goto retry_bmp; /* all threads participating will initialize to the same values */ - if (!pw->end) { + if (unlikely(!pw->end)) { pw->chunk_sz = chunk_size; pw->inc = incr; pw->st = start; pw->end = end; } - if (istart && iend) return _gomp_loop_dynamic_next(t, pw, istart, iend); + if (likely(istart && iend)) return _gomp_loop_dynamic_next(t, pw, istart, iend); else return true; } @@ -245,8 +268,7 @@ GOMP_loop_dynamic_next (long *istart, long *iend) unsigned coff = part_task_work_thd_num(t); int woff = t->ws_off[coff]; - woff = woff < 0 ? 0 : woff; - t->ws_off[coff] = woff; + if (unlikely(woff < 0)) t->ws_off[coff] = woff = 0; assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); return _gomp_loop_dynamic_next(t, &t->ws[woff], istart, iend); @@ -264,8 +286,8 @@ GOMP_loop_end (void) part_task_barrier(t); // do { -// c = ps_load((unsigned long *)&t->nwsdone); -// } while (!ps_cas((unsigned long *)&t->nwsdone, c, c | (1 << woff))); +// c = ps_load(&t->nwsdone); +// } while (!ps_cas(&t->nwsdone, c, c | (1 << woff))); } void @@ -277,20 +299,93 @@ GOMP_loop_end_nowait (void) assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); // do { -// c = ps_load((unsigned long *)&t->nwsdone); -// } while (!ps_cas((unsigned long *)&t->nwsdone, c, c | (1 << woff))); +// c = ps_load(&t->nwsdone); +// } while (!ps_cas(&t->nwsdone, c, c | (1 << woff))); } void GOMP_critical_start (void) { -// /* TODO: a multi-core lock! */ -// sl_lock_take(&_cos_gomp_lock); + crt_lock_take(&_glock); } void GOMP_critical_end (void) { -// /* TODO: a multi-core lock! */ -// sl_lock_release(&_cos_gomp_lock); + crt_lock_release(&_glock); +} + +void +GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), + long arg_size, long arg_align, bool if_clause, unsigned flags, + void **depend, int priority) +{ + struct part_task *parent = (struct part_task *)sl_thd_curr()->part_context; + int parent_off = -1, ret = -1; + + /* + * There should be nothing that prevents us to enqueue a task that + * has a dependency, in or out! + * The thread that pops this task should potentially do the dependency + * tracking before/after execution of the function. + */ + /* TODO: depend, flags, etc! */ + assert(!depend); + + if (if_clause) { + struct part_task *pt = _cos_gomp_alloc_explicit(); + char *arg = _cos_gomp_alloc_data_explicit(); + + assert(arg_size + arg_align - 1 <= COS_GOMP_MAX_ARGS_SZ); + if (cpyfn) cpyfn(arg, data); + else memcpy(arg, data, arg_size); + + assert(parent); + part_task_init(pt, 0, parent, 0, fn, arg); + parent_off = part_task_add_child(parent, pt); + assert(parent_off >= 0); + + do { + ret = part_deque_push(pt); + } while (ret == -EAGAIN); + assert(ret == 0); + } else { + /* if_clause is false, task is an included/undeferred task */ + struct part_task pt; + + assert(parent); + part_task_init(&pt, 0, parent, 0, fn, data); + parent_off = part_task_add_child(parent, &pt); + assert(parent_off >= 0); + + /* TODO: do I still need to make a copy like in libgomp? */ + fn(data); + + part_task_end(&pt); + } +} + +void +GOMP_taskwait (void) +{ + struct part_task *t = sl_thd_curr()->part_context; + int i; + + for (i = 0; i < PART_MAX_CHILD; i++) { + struct part_task *ct = t->child[i]; + + if (!ct) continue; + + /* + * TODO: + * Options for explicit tasks: + * 1. 
Perhaps run that task here if it has not been picked up by any other thread, + * unfortunately we cannot do that with "deque" data-structure! + * 2. Perhaps yield to a free thread that could potentially run that task? + * 3. Just yield (a task scheduling point = a thread scheduling point), + * so other pending work is taken care of before we get to run again! + */ + while (ct) sl_thd_yield(0); + } + /* no barriers of course! */ } diff --git a/src/components/lib/part.c b/src/components/lib/part.c index 59e0add4f6..9ce2471990 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -10,48 +10,42 @@ struct deque_part part_dq_percore[NUM_CPU]; //struct cirque_par parcq_global; struct ps_list_head part_l_global; static unsigned part_ready = 0; +struct crt_lock part_l_lock; #define _PART_PRIO 1 #define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) unsigned part_isready(void) -{ return part_ready; } +{ return (part_ready == NUM_CPU); } void part_init(void) { - int j; - struct sl_xcore_thd *x; - sched_param_t p = _PART_PRIO_PACK(); - sched_param_t pa[1] = { p }; - struct sl_thd *t; - static int is_first = NUM_CPU; - - ps_list_head_init(&part_l_global); - if (!ps_cas((unsigned long *)&is_first, NUM_CPU, cos_cpuid())) return; + int k; + static int is_first = NUM_CPU, ds_init_done = 0; + + if (!ps_cas(&is_first, NUM_CPU, cos_cpuid())) { + while (!ps_load(&ds_init_done)) ; + } else { + ps_list_head_init(&part_l_global); + crt_lock_init(&part_l_lock); + ps_faa(&ds_init_done, 1); + } - for (j = 0; j < NUM_CPU; j++) { - int k; - - if (j == cos_cpuid()) { - for (k = 0; k < PART_MAX_CORE_THDS; k++) { - t = sl_thd_alloc(part_thd_fn, NULL); - assert(t); - - sl_thd_param_set(t, p); - - x = sl_xcore_thd_lookup(sl_thd_thdid(t), cos_cpuid()); - assert(x); - } - - } else { - for (k = 0; k < PART_MAX_CORE_THDS; k++) { - x = sl_xcore_thd_alloc(j, part_thd_fn, NULL, 1, pa); - assert(x); - } - } + for (k = 0; k < PART_MAX_CORE_THDS; k++) { + struct sl_xcore_thd *x; + struct sl_thd *t; + sched_param_t p = _PART_PRIO_PACK(); + + t = sl_thd_alloc(part_thd_fn, NULL); + assert(t); + + sl_thd_param_set(t, p); + + x = sl_xcore_thd_lookup_init(sl_thd_thdid(t), cos_cpuid()); + assert(x); } - part_ready = 1; + ps_faa(&part_ready, 1); } diff --git a/src/components/lib/sl/sl_blkpt.c b/src/components/lib/sl/sl_blkpt.c index dac56db1d1..de59ee69a1 100644 --- a/src/components/lib/sl/sl_blkpt.c +++ b/src/components/lib/sl/sl_blkpt.c @@ -41,7 +41,7 @@ sched_blkpt_alloc(void) sl_cs_enter(); - id = (sched_blkpt_id_t)__blkpt_offset; + id = (sched_blkpt_id_t)ps_faa(&__blkpt_offset, 1); m = blkpt_get(id); if (!m) ERR_THROW(SCHED_BLKPT_NULL, unlock); @@ -49,7 +49,7 @@ sched_blkpt_alloc(void) ret = id; m->epoch = 0; stacklist_init(&m->blocked); - __blkpt_offset++; + /* TODO: undo offset if it failed in an multi-core safe way!*/ unlock: sl_cs_exit(); @@ -67,7 +67,7 @@ int sched_blkpt_trigger(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, int single) { thdid_t tid; - struct sl_thd *t; + cpuid_t core; struct blkpt_mem *m; int ret = 0; @@ -80,11 +80,22 @@ sched_blkpt_trigger(sched_blkpt_id_t blkpt, sched_blkpt_epoch_t epoch, int singl if (!blkpt_epoch_is_higher(m->epoch, epoch)) ERR_THROW(0, unlock); m->epoch = epoch; - while ((tid = stacklist_dequeue(&m->blocked)) != 0) { - t = sl_thd_lkup(tid); - assert(t); - - sl_thd_wakeup_no_cs(t); /* ignore retval: process next thread */ + while ((tid = stacklist_dequeue(&core, &m->blocked)) != 0) { + if (core == cos_cpuid()) { + struct sl_thd *t = sl_thd_lkup(tid); + + 
assert(t); + + sl_thd_wakeup_no_cs(t); /* ignore retval: process next thread */ + } else { + struct sl_xcore_thd *t = sl_xcore_thd_lookup(tid); + + assert(t && t->core == core); + /* perhaps sl_xcore_thd_wakeup_no_cs? */ + sl_cs_exit(); + sl_xcore_thd_wakeup(t); + sl_cs_enter(); + } } /* most likely we switch to a woken thread here */ sl_cs_exit_schedule(); diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index d8ec61991c..346e59c5ba 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -81,6 +81,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t t->prio = TCAP_PRIO_MIN; ps_list_init(t, SL_THD_EVENT_LIST); sl_thd_event_info_reset(t); + sl_xcore_thd_lookup_init(aep->tid, cos_cpuid()); done: return t; diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 7502e41c64..062a6ccb1a 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -524,14 +524,13 @@ sl_thd_exit() } void -sl_thd_param_set(struct sl_thd *t, sched_param_t sp) +sl_thd_param_set_no_cs(struct sl_thd *t, sched_param_t sp) { sched_param_type_t type; unsigned int value; assert(t); - sl_cs_enter(); sched_param_get(sp, &type, &value); switch (type) { @@ -550,6 +549,16 @@ sl_thd_param_set(struct sl_thd *t, sched_param_t sp) } sl_mod_thd_param_set(sl_mod_thd_policy_get(t), type, value); +} + +void +sl_thd_param_set(struct sl_thd *t, sched_param_t sp) +{ + assert(t); + + sl_cs_enter(); + + sl_thd_param_set_no_cs(t, sp); sl_cs_exit(); } diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c index 283cf9fb79..ca17543a34 100644 --- a/src/components/lib/sl/sl_xcore.c +++ b/src/components/lib/sl/sl_xcore.c @@ -8,6 +8,7 @@ /* static xcore thread backend! mainly for bookkeeping across cores! 
*/ static struct sl_xcore_thd _xcore_thds[MAX_NUM_THREADS]; +extern void sl_thd_param_set_no_cs(struct sl_thd *, sched_param_t); static inline struct sl_xcore_thd * _sl_xcore_thd_backend_lookup(thdid_t tid) @@ -28,7 +29,7 @@ _sl_xcore_thd_backend_init(thdid_t tid, cpuid_t core, asndcap_t snd) } struct sl_xcore_thd * -sl_xcore_thd_lookup(thdid_t tid, cpuid_t core) +sl_xcore_thd_lookup_init(thdid_t tid, cpuid_t core) { struct sl_xcore_thd *t = _sl_xcore_thd_backend_lookup(tid); @@ -40,6 +41,12 @@ sl_xcore_thd_lookup(thdid_t tid, cpuid_t core) return t; } +struct sl_xcore_thd * +sl_xcore_thd_lookup(thdid_t tid) +{ + return _sl_xcore_thd_backend_lookup(tid); +} + #define SL_XCORE_REQ(req, typ, resp) do { \ req.type = typ; \ req.client_core = cos_cpuid(); \ @@ -105,7 +112,7 @@ sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched if (sl_thd_curr() != sl__globals_core()->sched_thd) { sl_thd_block(0); } else { - while (!xcore_tid) sl_thd_yield(0); + while (!ps_load(&xcore_tid)) ; } assert(xcore_tid); @@ -181,7 +188,9 @@ sl_xcore_thd_wakeup(struct sl_xcore_thd *t) void sl_xcore_thd_wakeup_tid(thdid_t tid, cpuid_t core) { - struct sl_xcore_thd *t = sl_xcore_thd_lookup(tid, core); + struct sl_xcore_thd *t = sl_xcore_thd_lookup(tid); + + assert(t->core == core); sl_xcore_thd_wakeup(t); } @@ -201,7 +210,7 @@ _sl_xcore_req_thd_alloc_no_cs(struct sl_xcore_request *req) t = sl_thd_alloc_no_cs(fn, data); assert(t); if (likely(req->response)) *((thdid_t *)req->response) = sl_thd_thdid(t); - for (i = 0; i < req->sl_xcore_req_thd_alloc.param_count; i++) sl_thd_param_set(t, req->sl_xcore_req_thd_alloc.params[i]); + for (i = 0; i < req->sl_xcore_req_thd_alloc.param_count; i++) sl_thd_param_set_no_cs(t, req->sl_xcore_req_thd_alloc.params[i]); _sl_xcore_thd_wakeup_tid_no_cs(req->client_thd, req->client_core); return 0; @@ -213,7 +222,7 @@ _sl_xcore_req_thd_param_set_no_cs(struct sl_xcore_request *req) struct sl_thd *t = sl_thd_lkup(req->sl_xcore_req_thd_param_set.tid); if (!t) return -1; - sl_thd_param_set(t, req->sl_xcore_req_thd_param_set.param); + sl_thd_param_set_no_cs(t, req->sl_xcore_req_thd_param_set.param); return 0; } @@ -224,6 +233,7 @@ _sl_xcore_req_thd_wakeup_no_cs(struct sl_xcore_request *req) struct sl_thd *t = sl_thd_lkup(req->sl_xcore_req_thd_param_set.tid); if (!t) return -1; + if (unlikely(t == sl__globals_core()->sched_thd)) return 0; sl_thd_wakeup_no_cs(t); return 0; diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index bf501b3be9..8c46ae5377 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -17,7 +17,7 @@ #include "cpu_ghz.h" -#define NUM_CPU 1 +#define NUM_CPU 2 #define NUM_CPU_BMP_BYTES ((NUM_CPU + 7) / 8) #define NUM_CPU_BMP_WORDS ((NUM_CPU_BMP_BYTES + 3) / 4) From 81279e069f7149e28049d95ef34f8dbbd3029df0 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 11 May 2019 19:13:29 -0400 Subject: [PATCH 060/127] fixed task construct basic working --- .../no_interface/omp_hello/hello_omp.c | 6 ++-- src/components/include/deque.h | 4 +-- src/components/include/part.h | 35 +++++++++++-------- src/components/include/part_task.h | 18 ++++++++-- src/components/lib/cos_gomp/cos_gomp.c | 4 +-- src/components/lib/cos_gomp/cos_gomp.h | 2 +- src/components/lib/part.c | 2 ++ 7 files changed, 47 insertions(+), 24 deletions(-) diff --git a/src/components/implementation/no_interface/omp_hello/hello_omp.c b/src/components/implementation/no_interface/omp_hello/hello_omp.c index 
f2d2495aa9..2d54f4bd52 100644 --- a/src/components/implementation/no_interface/omp_hello/hello_omp.c +++ b/src/components/implementation/no_interface/omp_hello/hello_omp.c @@ -59,10 +59,12 @@ int main ( void ) INSIDE THE PARALLEL REGION, have each thread say hello. */ #if 1 -#pragma omp parallel +#pragma omp parallel num_threads(2) private(id) { #pragma omp for - for (id = 0; id < 10; id++) { + for (id = 0; id < 10; id++) + { +#pragma omp task PRINTC("id:%u\n", id); } } diff --git a/src/components/include/deque.h b/src/components/include/deque.h index 21422eab26..1190814648 100644 --- a/src/components/include/deque.h +++ b/src/components/include/deque.h @@ -37,7 +37,7 @@ deque_init_##name(struct deque_##name *q, size_t sz) \ \ if (sz) { \ /* only for size with pow of 2 */ \ - assert(sz & (sz - 1)); \ + assert((sz & (sz - 1)) == 0); \ assert(sz <= DEQUE_MAX_SZ); \ } else { \ sz = DEQUE_MAX_SZ; \ @@ -62,7 +62,7 @@ deque_push_##name(struct deque_##name *q, type *w) \ ps_mem_fence(); \ if (!ps_upcas((unsigned long *)&q->bottom, cb, cb + 1)) assert(0); \ \ - return -EAGAIN; \ + return 0; \ } \ \ /* Use mutual exclusion locks around push/pop if multi-threaded. */ \ diff --git a/src/components/include/part.h b/src/components/include/part.h index 1cf459e997..65d070db4c 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -59,12 +59,12 @@ part_deque_push(struct part_task *t) } static inline int -part_deque_pop(struct part_task *t) +part_deque_pop(struct part_task **t) { int ret; sl_cs_enter(); - ret = deque_pop_part(part_deque_curr(), &t); + ret = deque_pop_part(part_deque_curr(), t); sl_cs_exit(); return ret; @@ -186,15 +186,22 @@ part_task_end(struct part_task *t) struct sl_thd *ts = sl_thd_curr(); int tn = part_task_work_thd_num(t); - part_task_barrier(t); - assert(tn >= 0 && t->nthds >= 1); assert(ts->part_context == (void *)t); if (t->nthds == 1) { + int i; + assert(tn == 0); + for (i = 0; i < PART_MAX_CHILD; i++) { + while (ps_load(&t->child[i])) sl_thd_yield(0); + } + ps_faa(&t->end, 1); + part_task_remove_child(t->parent, t); + ts->part_context = 0; return; } + part_task_barrier(t); if (tn == 0) { if (t->type == PART_TASK_T_WORKSHARE) part_list_remove(t); @@ -218,25 +225,25 @@ part_thd_fn(void *d) /* parallel runtime not ready? */ while (unlikely(!part_isready())) sl_thd_yield(0); - /* no parallel sections? */ - while (unlikely(ps_list_head_empty(part_list()))) sl_thd_yield(0); - while (1) { struct part_task *t = NULL; int ret; - int thdnum = 0; + int thdnum = -1; unsigned thd = cos_cpuid() << 16 | cos_thdid(); /* FIXME: nested parallel needs love! 
*/ t = part_list_peek(); - if (likely(t)) goto found; + if (likely(t)) { + thdnum = part_task_work_try(t); + if (thdnum >= 0) goto found; + } single: - ret = part_deque_pop(t); + ret = part_deque_pop(&t); if (likely(ret == 0)) { - assert(t->type != PART_TASK_T_WORKSHARE); - - goto found; + assert(t && t->type != PART_TASK_T_WORKSHARE); + thdnum = part_task_work_try(t); + if (thdnum == 0) goto found; } if (unlikely(ret == -EAGAIN)) goto single; @@ -248,7 +255,7 @@ part_thd_fn(void *d) } assert(t->type != PART_TASK_T_WORKSHARE); found: - thdnum = part_task_work_try(t); + if (unlikely(thdnum < 0)) thdnum = part_task_work_try(t); if (unlikely(thdnum < 0)) continue; if (t->type != PART_TASK_T_WORKSHARE) assert(thdnum == 0); curr->part_context = (void *)t; diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index 3c6be3519e..2249e17eaa 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -12,7 +12,7 @@ #define PART_MAX 4 #define PART_MAX_CORE_THDS 4 #define PART_MAX_THDS PART_MAX_CORE_THDS*NUM_CPU -#define PART_MAX_CHILD PART_MAX +#define PART_MAX_CHILD 16 #define PART_MAX_WORKSHARES 16 typedef void (*part_fn_t)(void *); @@ -159,7 +159,13 @@ part_task_work_thd_num(struct part_task *t) int i; unsigned key = PART_CURR_THD; - if (t->type != PART_TASK_T_WORKSHARE) assert(t->nthds == 1); + if (t->type != PART_TASK_T_WORKSHARE) { + assert(t->nthds == 1); + + if (t->workers[0] == key) return 0; + + return -1; + } if (key == t->master) return 0; for (i = 1; i < (int)t->nthds; i++) { @@ -172,15 +178,21 @@ part_task_work_thd_num(struct part_task *t) static inline void part_task_barrier(struct part_task *t) { - struct sl_thd *ts = sl_thd_curr(); int tn = part_task_work_thd_num(t); unsigned cin = 0, cout = 0; assert(tn >= 0 && t->nthds >= 1); if (t->nthds == 1) { + int i; + assert(tn == 0 && t->barrier_in == 0); + /* wait for all child tasks to complete, including explicit tasks */ + for (i = 0; i < PART_MAX_CHILD; i++) { + while (ps_load(&t->child[i])) sl_thd_yield(0); + } + return; } diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index 90bda9d1a7..2869d73062 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -341,7 +341,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), else memcpy(arg, data, arg_size); assert(parent); - part_task_init(pt, 0, parent, 0, fn, arg); + part_task_init(pt, 0, parent, 1, fn, arg); parent_off = part_task_add_child(parent, pt); assert(parent_off >= 0); @@ -354,7 +354,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), struct part_task pt; assert(parent); - part_task_init(&pt, 0, parent, 0, fn, data); + part_task_init(&pt, 0, parent, 1, fn, data); parent_off = part_task_add_child(parent, &pt); assert(parent_off >= 0); diff --git a/src/components/lib/cos_gomp/cos_gomp.h b/src/components/lib/cos_gomp/cos_gomp.h index 9d11a85df0..f64de36d88 100644 --- a/src/components/lib/cos_gomp/cos_gomp.h +++ b/src/components/lib/cos_gomp/cos_gomp.h @@ -3,7 +3,7 @@ #include -#define COS_GOMP_MAX_THDS PART_MAX_THDS +#define COS_GOMP_MAX_THDS 4 #define COS_GOMP_CORE_MAX_THDS PART_MAX_CORE_THDS #define COS_GOMP_MAX_CHILD PART_MAX_CHILD #define COS_GOMP_MAX_TASKS 4096 diff --git a/src/components/lib/part.c b/src/components/lib/part.c index 9ce2471990..270677cd79 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -12,6 +12,7 @@ struct ps_list_head 
part_l_global; static unsigned part_ready = 0; struct crt_lock part_l_lock; +#define PART_DEQUE_SZ 64 #define _PART_PRIO 1 #define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) @@ -28,6 +29,7 @@ part_init(void) if (!ps_cas(&is_first, NUM_CPU, cos_cpuid())) { while (!ps_load(&ds_init_done)) ; } else { + for (k = 0; k < NUM_CPU; k++) deque_init_part(&part_dq_percore[k], PART_DEQUE_SZ); ps_list_head_init(&part_l_global); crt_lock_init(&part_l_lock); ps_faa(&ds_init_done, 1); From 4b2fb9044f05189207d30686c81a2293027875f9 Mon Sep 17 00:00:00 2001 From: Gabe Parmer Date: Sat, 11 May 2019 23:23:33 -0400 Subject: [PATCH 061/127] Added primitive, blocking message queue support - Asynchronous by default, and blocking on boundary conditions (queue full/empty) - Macros to inline most of the channel logic and enable the magic of constant propagation - Simple implementation for a single producer, single consumer --- .../implementation/tests/crt_tests/crttests.c | 157 +++++++++++- src/components/include/crt_chan.h | 233 ++++++++++++++++++ 2 files changed, 380 insertions(+), 10 deletions(-) create mode 100644 src/components/include/crt_chan.h diff --git a/src/components/implementation/tests/crt_tests/crttests.c b/src/components/implementation/tests/crt_tests/crttests.c index d7632a7c17..ac8882afac 100644 --- a/src/components/implementation/tests/crt_tests/crttests.c +++ b/src/components/implementation/tests/crt_tests/crttests.c @@ -10,28 +10,164 @@ #include #include +#include + +struct cos_compinfo *ci; + +#define CHAN_ITER 1000000 +#define NCHANTHDS 5 +#define CHAN_BATCH 3 + +CRT_CHAN_STATIC_ALLOC(c0, int, 4); +CRT_CHAN_STATIC_ALLOC(c1, int, 4); +CRT_CHAN_STATIC_ALLOC(c2, int, 4); +CRT_CHAN_STATIC_ALLOC(c3, int, 4); +CRT_CHAN_STATIC_ALLOC(c4, int, 4); + +CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); +struct crt_chan *chans[NCHANTHDS + 1]; +struct sl_thd *chan_thds[NCHANTHDS] = {NULL, }; + +typedef enum { CHILLING = 0, RECVING, SENDING } actions_t; +unsigned long status[NCHANTHDS]; +unsigned long cnts[NCHANTHDS] = {0, }; + +int +chantest_is_deadlocked(void) +{ + int i; + actions_t s = status[0]; + + /* Are all threads in the same blocked state? */ + for (i = 0; i < NCHANTHDS; i++) { + if (status[i] == CHILLING || status[i] != s) return 0; + } + + return 1; +} + +void +chantest_send(int thd_off, struct crt_chan *c) +{ + int send = cos_thdid(); + + if (crt_chan_full_test(c)) status[thd_off] = SENDING; + if (!chantest_is_deadlocked()) { + /* printc("\t%d: send\n", cos_thdid()); */ + crt_chan_send_test(c, &send); + } + status[thd_off] = CHILLING; +} + +void +chantest_recv(int thd_off, struct crt_chan *c) +{ + int recv; + + if (crt_chan_empty_test(c)) status[thd_off] = RECVING; + if (!chantest_is_deadlocked()) { + /* printc("\t%d: recv\n", cos_thdid()); */ + crt_chan_recv_test(c, &recv); + cnts[thd_off]++; + } + status[thd_off] = CHILLING; +} + +void +chan_thd(void *d) +{ + int thd_off = (int)d; + struct crt_chan **chan_pair = &chans[thd_off]; + int recv; + int i; + + for (i = 0; i < CHAN_ITER; i++) { + int j; + + /* printc("%d: pre-send\n", cos_thdid()); */ + for (j = 0; j < CHAN_BATCH; j++) { + chantest_send(thd_off, chan_pair[1]); + } + + /* printc("%d: pre-recv\n", cos_thdid()); */ + for (j = 0; j < CHAN_BATCH; j++) { + chantest_recv(thd_off, chan_pair[0]); + } + } + + printc("SUCCESS! 
Counts (should be within %d of each other): ", NCHANTHDS * CHAN_BATCH); + for (i = 0; i < NCHANTHDS; i++) { + printc("\t%ld", cnts[i]); + } + printc("\n"); + while (1) ; +} + +void +idle_thd(void *d) +{ + printc("FAILURE: deadlock!\n"); + while (1) ; +} + +void +test_chan(void) +{ + int i; + struct sl_thd *idle; + union sched_param_union idle_param = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + + union sched_param_union sps[] = { + {.c = {.type = SCHEDP_PRIO, .value = 7}}, + {.c = {.type = SCHEDP_PRIO, .value = 6}}, + {.c = {.type = SCHEDP_PRIO, .value = 8}}, + {.c = {.type = SCHEDP_PRIO, .value = 5}}, + {.c = {.type = SCHEDP_PRIO, .value = 5}} + }; + + chans[0] = c0; + chans[1] = c1; + chans[2] = c2; + chans[3] = c3; + chans[4] = c4; + chans[5] = c0; + + for (i = 0; i < NCHANTHDS; i++) { + crt_chan_init_test(chans[i]); + } + + printc("Create threads:\n"); + for (i = 0; i < NCHANTHDS; i++) { + chan_thds[i] = sl_thd_alloc(chan_thd, (void *)i); + assert(chan_thds[i]); + printc("\tcreating thread %d at prio %d\n", sl_thd_thdid(chan_thds[i]), sps[i].c.value); + sl_thd_param_set(chan_thds[i], sps[i].v); + } + idle = sl_thd_alloc(idle_thd, NULL); + printc("\tcreating IDLE %d at prio %d\n", sl_thd_thdid(idle), idle_param.c.value); + sl_thd_param_set(idle, idle_param.v); + +} #define LOCK_ITER 1000000 -#define NTHDS 4 +#define NLOCKTHDS 4 struct crt_lock lock; -struct sl_thd *lock_thds[NTHDS] = {NULL, }; -unsigned int progress[NTHDS] = {0, }; -struct cos_compinfo *ci; +struct sl_thd *lock_thds[NLOCKTHDS] = {NULL, }; +unsigned int progress[NLOCKTHDS] = {0, }; +volatile thdid_t holder; thdid_t next_thd(void) { - return sl_thd_thdid(lock_thds[(unsigned int)(ps_tsc() % NTHDS)]); + return sl_thd_thdid(lock_thds[(unsigned int)(ps_tsc() % NLOCKTHDS)]); } -volatile thdid_t holder; - void lock_thd(void *d) { int i, cnt, me = -1; - for (i = 0; i < NTHDS; i++) { + for (i = 0; i < NLOCKTHDS; i++) { if (sl_thd_thdid(lock_thds[i]) != cos_thdid()) continue; me = i; @@ -56,7 +192,7 @@ lock_thd(void *d) sl_thd_yield(next_thd()); } - for (i = 0; i < NTHDS; i++) { + for (i = 0; i < NLOCKTHDS; i++) { if (i == me) continue; if (progress[i] < LOCK_ITER) { @@ -82,7 +218,7 @@ test_lock(void) crt_lock_init(&lock); printc("Create threads:\n"); - for (i = 0; i < NTHDS; i++) { + for (i = 0; i < NLOCKTHDS; i++) { lock_thds[i] = sl_thd_alloc(lock_thd, NULL); printc("\tcreating thread %d at prio %d\n", sl_thd_thdid(lock_thds[i]), sps[i].c.value); sl_thd_param_set(lock_thds[i], sps[i].v); @@ -101,6 +237,7 @@ cos_init(void) sl_init(SL_MIN_PERIOD_US); test_lock(); +// test_chan(); printc("Running benchmark...\n"); sl_sched_loop_nonblock(); diff --git a/src/components/include/crt_chan.h b/src/components/include/crt_chan.h new file mode 100644 index 0000000000..39a06974f4 --- /dev/null +++ b/src/components/include/crt_chan.h @@ -0,0 +1,233 @@ +/* + * Copyright 2019, Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. + */ + +#ifndef CRT_CHAN_H +#define CRT_CHAN_H + +/*** + * + */ + +#include +#include +#include + +struct crt_chan { + u32_t producer; + /* If the ring is empty, recving threads will block on this blkpt. */ + struct crt_blkpt empty; + char _padding1[CACHE_LINE * 2 - (sizeof(struct crt_blkpt) + sizeof(u32_t))]; + u32_t consumer; + /* If the ring is full, sending thread will block on this blkpt. */ + struct crt_blkpt full; + char _padding2[CACHE_LINE * 2 - (sizeof(struct crt_blkpt) + sizeof(u32_t))]; + /* + * @item_sz is a power of two and corresponds to the + * wraparound_mask. 
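(Concretely, with nslots = 4 the
 + * wraparound_mask is 3, slot indices wrap with a single bitwise AND, and
 + * at most 3 items can be buffered, since one slot is kept empty to
 + * distinguish a full ring from an empty one.)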
The number of data items that the channel + * can hold is item_sz - 1. @wraparound_mask = nslots-1 (were + * nslots is a power of two) + */ + u32_t item_sz, wraparound_mask; + u32_t nslots; + /* The memory for the channel. */ + char mem[0]; +}; + +/* produce a */ +#define CRT_CHAN_STATIC_ALLOC(name, type, nslots) \ +struct __crt_chan_envelope_##name { \ + struct crt_chan c; \ + char mem[nslots * sizeof(type)]; \ +} __##name; \ +struct crt_chan *name = &__##name.c + +#define CRT_CHAN_TYPE_PROTOTYPES(name, type, nslots) \ +static inline int \ +crt_chan_init_##name(struct crt_chan *c) \ +{ return crt_chan_init(c, sizeof(type), nslots); } \ +static inline void \ +crt_chan_teardown_##name(struct crt_chan *c) \ +{ crt_chan_teardown(c); } \ +static inline int \ +crt_chan_empty_##name(struct crt_chan *c) \ +{ return __crt_chan_empty(c, nslots - 1); } \ +static inline int \ +crt_chan_full_##name(struct crt_chan *c) \ +{ return __crt_chan_full(c, nslots - 1); } \ +static inline int \ +crt_chan_send_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + return __crt_chan_send(c, item, nslots - 1, sizeof(type)); \ +} \ +static inline int \ +crt_chan_recv_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + return __crt_chan_recv(c, item, nslots - 1, sizeof(type)); \ +} \ +static inline int \ +crt_chan_async_send_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + if (__crt_chan_produce(c, item, nslots - 1, sizeof(type))) return -EAGAIN; \ + return 0; \ +} \ +static inline int \ +crt_chan_async_recv_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + if (__crt_chan_consume(c, item, nslots - 1, sizeof(type))) return -EAGAIN; \ + return 0; \ +} + +#define CRT_CHANCHAN_PROTOTYPES(nslots) \ +CRT_CHAN_TYPE_PROTOTYPES(chan, struct chan *, nslots + +static inline unsigned int +__crt_chan_buff_idx(struct crt_chan *c, u32_t v, u32_t wraparound_mask) +{ return v & wraparound_mask; } + +static inline int +__crt_chan_full(struct crt_chan *c, u32_t wraparound_mask) +{ return c->consumer == __crt_chan_buff_idx(c, c->producer + 1, wraparound_mask); } + +static inline int +__crt_chan_empty(struct crt_chan *c, u32_t wraparound_mask) +{ return c->producer == c->consumer; } + +static inline int +__crt_chan_produce(struct crt_chan *c, void *d, u32_t wraparound_mask, u32_t sz) +{ + if (__crt_chan_full(c, wraparound_mask)) return 1; + memcpy(c->mem + (__crt_chan_buff_idx(c, c->producer, wraparound_mask) * sz), d, sz); + c->producer++; + + return 0; +} + +static inline int +__crt_chan_consume(struct crt_chan *c, void *d, u32_t wraparound_mask, u32_t sz) +{ + void *ret; + + if (__crt_chan_empty(c, wraparound_mask)) return 1; + memcpy(d, c->mem + (__crt_chan_buff_idx(c, c->consumer, wraparound_mask) * sz), sz); + c->consumer++; + + return 0; +} + +/** + * The next two functions pass all of the variables in via arguments, + * so that we can use them for constant propagation along with + * inlining to get rid of the general memcpy code. + */ +static inline int +__crt_chan_send(struct crt_chan *c, void *item, u32_t wraparound_mask, u32_t item_sz) +{ + while (1) { + struct crt_blkpt_checkpoint chkpt; + + crt_blkpt_checkpoint(&c->full, &chkpt); + if (!__crt_chan_produce(c, item, wraparound_mask, item_sz)) { + /* success! 
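an item was enqueued; wake any receiver blocked on the ring-empty blockpoint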
*/ + crt_blkpt_trigger(&c->empty, 0); + break; + } + crt_blkpt_wait(&c->full, 0, &chkpt); + } + + return 0; +} + +static inline int +__crt_chan_recv(struct crt_chan *c, void *item, u32_t wraparound_mask, u32_t item_sz) +{ + while (1) { + struct crt_blkpt_checkpoint chkpt; + + crt_blkpt_checkpoint(&c->empty, &chkpt); + if (!__crt_chan_consume(c, item, wraparound_mask, item_sz)) { + /* success! */ + crt_blkpt_trigger(&c->full, 0); + break; + } + crt_blkpt_wait(&c->empty, 0, &chkpt); + } + + return 0; +} + + +/* + * We need to know how much to malloc? This function returns that + * requirement. It assumes (and checks) that @slots is a power of two. + */ +static inline int +crt_chan_mem_sz(int item_sz, int slots) +{ + assert(pow2(slots)); + + return sizeof(struct crt_chan) + item_sz * slots; +} + +/* How many slots can we fit into an allocation of a specific mem_sz */ +static inline int +crt_chan_nslots(int item_sz, int mem_sz) +{ + return leqpow2((mem_sz - sizeof(struct crt_chan)) / item_sz); +} + +static inline int +crt_chan_init(struct crt_chan *c, int item_sz, int slots) +{ + assert(pow2(slots)); + if (crt_blkpt_init(&c->empty)) return -1; + if (crt_blkpt_init(&c->full)) return -1; + c->nslots = slots; + c->item_sz = item_sz; + c->wraparound_mask = slots - 1; /* slots is a pow2 */ + + return 0; +} + +static inline void +crt_chan_teardown(struct crt_chan *c) +{ + crt_blkpt_teardown(&c->empty); + crt_blkpt_teardown(&c->full); +} + +/* User-facing send and receive APIs: */ + +static inline int +crt_chan_send(struct crt_chan *c, void *item) +{ + return __crt_chan_send(c, item, c->wraparound_mask, c->item_sz); +} + +static inline int +crt_chan_recv(struct crt_chan *c, void *item) +{ + return __crt_chan_recv(c, item, c->wraparound_mask, c->item_sz); +} + +static inline int +crt_chan_async_send(struct crt_chan *c, void *item) +{ + if (__crt_chan_produce(c, item, c->wraparound_mask, c->item_sz)) return -EAGAIN; + return 0; +} + +static inline int +crt_chan_async_recv(struct crt_chan *c, void *item) +{ + if (__crt_chan_consume(c, item, c->wraparound_mask, c->item_sz)) return -EAGAIN; + return 0; +} + +#endif /* CRT_CHAN_H */ From 869e7abccf18cdbdcbef13e570d3a783d5ea9747 Mon Sep 17 00:00:00 2001 From: phani Date: Sun, 12 May 2019 12:25:49 -0400 Subject: [PATCH 062/127] common init.c for omp execution --- .../no_interface/omp_dijkstra/dijkstra_omp.c | 57 +----------------- .../no_interface/omp_dijkstra/init.c | 1 + .../no_interface/omp_hello/hello_omp.c | 57 +----------------- .../no_interface/omp_hello/init.c | 60 +++++++++++++++++++ 4 files changed, 65 insertions(+), 110 deletions(-) create mode 120000 src/components/implementation/no_interface/omp_dijkstra/init.c create mode 100644 src/components/implementation/no_interface/omp_hello/init.c diff --git a/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c index 34a648ef95..4eb5375c3c 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c +++ b/src/components/implementation/no_interface/omp_dijkstra/dijkstra_omp.c @@ -1,9 +1,7 @@ -#include -#include -#include #include -#include -#include +#include +#include +#include # define NV 6 @@ -564,52 +562,3 @@ void update_mind ( int s, int e, int mv, int connected[NV], int ohd[NV][NV], } return; } - - -static void -cos_main(void *d) -{ - main(); -} - -extern void cos_gomp_init(void); -void -cos_init(void *d) -{ - struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); - struct 
cos_compinfo * ci = cos_compinfo_get(defci); - int i; - static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; - - PRINTC("In OpenMP-based Hello Program!\n"); - if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { - cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); - } else { - while (!ps_load((unsigned long *)&init_done[first])) ; - - cos_defcompinfo_sched_init(); - } - ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); - - /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ - for (i = 0; i < NUM_CPU; i++) { - while (!ps_load((unsigned long *)&init_done[i])) ; - } - sl_init(SL_MIN_PERIOD_US*100); - cos_gomp_init(); - hypercall_comp_init_done(); - - if (!cos_cpuid()) { - struct sl_thd *t = NULL; - - t = sl_thd_alloc(cos_main, NULL); - assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); - } - - sl_sched_loop_nonblock(); - - PRINTC("Should never get here!\n"); - assert(0); -} diff --git a/src/components/implementation/no_interface/omp_dijkstra/init.c b/src/components/implementation/no_interface/omp_dijkstra/init.c new file mode 120000 index 0000000000..b2694bf833 --- /dev/null +++ b/src/components/implementation/no_interface/omp_dijkstra/init.c @@ -0,0 +1 @@ +../omp_hello/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_hello/hello_omp.c b/src/components/implementation/no_interface/omp_hello/hello_omp.c index 2d54f4bd52..5bfb7757ee 100644 --- a/src/components/implementation/no_interface/omp_hello/hello_omp.c +++ b/src/components/implementation/no_interface/omp_hello/hello_omp.c @@ -1,9 +1,5 @@ -#include -#include #include -#include -#include -#include +#include /******************************************************************************/ @@ -95,54 +91,3 @@ int main ( void ) return 0; } - -static void -cos_main(void *d) -{ - main(); - - while (1); -} - -extern void cos_gomp_init(void); - -void -cos_init(void *d) -{ - struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); - struct cos_compinfo * ci = cos_compinfo_get(defci); - int i; - static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; - - PRINTC("In OpenMP-based Hello Program!\n"); - if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { - cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); - } else { - while (!ps_load((unsigned long *)&init_done[first])) ; - - cos_defcompinfo_sched_init(); - } - ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); - - /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ - for (i = 0; i < NUM_CPU; i++) { - while (!ps_load((unsigned long *)&init_done[i])) ; - } - sl_init(SL_MIN_PERIOD_US*100); - cos_gomp_init(); - hypercall_comp_init_done(); - - if (!cos_cpuid()) { - struct sl_thd *t = NULL; - - t = sl_thd_alloc(cos_main, NULL); - assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); - } - - sl_sched_loop_nonblock(); - - PRINTC("Should never get here!\n"); - assert(0); -} diff --git a/src/components/implementation/no_interface/omp_hello/init.c b/src/components/implementation/no_interface/omp_hello/init.c new file mode 100644 index 0000000000..6973648098 --- /dev/null +++ b/src/components/implementation/no_interface/omp_hello/init.c @@ -0,0 +1,60 @@ +#include +#include +#include +#include +#include +#include + +int main(void); + +static void +cos_main(void *d) +{ + main(); + + while (1) ; +} + +extern void cos_gomp_init(void); + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + + PRINTC("In OpenMP-based Hello Program!\n"); + if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + } else { + while (!ps_load((unsigned long *)&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load((unsigned long *)&init_done[i])) ; + } + sl_init(SL_MIN_PERIOD_US*100); + cos_gomp_init(); + hypercall_comp_init_done(); + + if (!cos_cpuid()) { + struct sl_thd *t = NULL; + + t = sl_thd_alloc(cos_main, NULL); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + } + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} + From d0fbd49611e48f386d5ab0fb63f907326c9744e5 Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 13 May 2019 22:46:47 -0400 Subject: [PATCH 063/127] bugfixes in part and added bots-fib benchmark --- .../no_interface/omp_fib_bots/Makefile | 19 + .../no_interface/omp_fib_bots/app-desc.h | 47 ++ .../no_interface/omp_fib_bots/bots.h | 113 ++++ .../no_interface/omp_fib_bots/bots_common.c | 340 +++++++++++ .../no_interface/omp_fib_bots/bots_common.h | 56 ++ .../no_interface/omp_fib_bots/bots_main.c | 538 ++++++++++++++++++ .../no_interface/omp_fib_bots/bots_main.h | 53 ++ .../no_interface/omp_fib_bots/fib.c | 235 ++++++++ .../no_interface/omp_fib_bots/fib.h | 40 ++ .../no_interface/omp_fib_bots/init.c | 1 + .../no_interface/omp_fib_bots/omp-tasks-app.h | 31 + .../no_interface/omp_fib_bots/posix_basic.c | 1 + .../no_interface/omp_hello/init.c | 9 +- .../tests/unit_schedtests/unit_schedlib.c | 4 +- src/components/include/part.h | 17 +- src/components/include/part_task.h | 65 ++- src/components/lib/cos_gomp/cos_gomp.c | 94 +-- src/components/lib/cos_gomp/cos_gomp.h | 4 +- src/components/lib/cos_gomp/cos_omp.c | 3 +- src/components/lib/part.c | 78 +++ src/kernel/include/shared/consts.h | 2 +- src/kernel/include/shared/cos_config.h | 2 +- src/platform/i386/runscripts/omp_fib_bots.sh | 7 + 23 files changed, 1658 insertions(+), 101 deletions(-) create mode 100644 src/components/implementation/no_interface/omp_fib_bots/Makefile create mode 100644 
src/components/implementation/no_interface/omp_fib_bots/app-desc.h create mode 100644 src/components/implementation/no_interface/omp_fib_bots/bots.h create mode 100644 src/components/implementation/no_interface/omp_fib_bots/bots_common.c create mode 100644 src/components/implementation/no_interface/omp_fib_bots/bots_common.h create mode 100644 src/components/implementation/no_interface/omp_fib_bots/bots_main.c create mode 100644 src/components/implementation/no_interface/omp_fib_bots/bots_main.h create mode 100644 src/components/implementation/no_interface/omp_fib_bots/fib.c create mode 100644 src/components/implementation/no_interface/omp_fib_bots/fib.h create mode 120000 src/components/implementation/no_interface/omp_fib_bots/init.c create mode 100644 src/components/implementation/no_interface/omp_fib_bots/omp-tasks-app.h create mode 120000 src/components/implementation/no_interface/omp_fib_bots/posix_basic.c create mode 100644 src/platform/i386/runscripts/omp_fib_bots.sh diff --git a/src/components/implementation/no_interface/omp_fib_bots/Makefile b/src/components/implementation/no_interface/omp_fib_bots/Makefile new file mode 100644 index 0000000000..20cdb21093 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_fib_bots.o +INTERFACES= +DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_fib_bots/app-desc.h b/src/components/implementation/no_interface/omp_fib_bots/app-desc.h new file mode 100644 index 0000000000..e8af171324 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/app-desc.h @@ -0,0 +1,47 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" + +#define BOTS_APP_NAME "Fibonacci" +#define BOTS_APP_PARAMETERS_DESC "N=%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE 20 +#define BOTS_APP_DESC_ARG_SIZE "Number to compute" + +int fib_verify(int); +void fib0 (int); +void fib0_seq (int); + +//#define KERNEL_INIT +#define KERNEL_CALL fib0(bots_arg_size) +//#define KERNEL_FINI + +//#define KERNEL_SEQ_INIT +#define KERNEL_SEQ_CALL fib0_seq(bots_arg_size) +//#define KERNEL_SEQ_FINI + + +#define KERNEL_CHECK fib_verify(bots_arg_size) + +#define BOTS_CUTOFF_DEF_VALUE 10 + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots.h b/src/components/implementation/no_interface/omp_fib_bots/bots.h new file mode 100644 index 0000000000..add69e42ec --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots.h @@ -0,0 +1,113 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#ifndef _BOTS_H_ + +#include +#include +#include + +/* common flags */ +extern int bots_sequential_flag; +extern int bots_benchmark_flag; +extern int bots_check_flag; +extern int bots_result; +extern int bots_output_format; +extern int bots_print_header; +/* common variables */ +extern char bots_name[]; +extern char bots_parameters[]; +extern char bots_model[]; +extern char bots_resources[]; +/* compile and execution information */ +extern char bots_exec_date[]; +extern char bots_exec_message[]; +extern char bots_comp_date[]; +extern char bots_comp_message[]; +extern char bots_cc[]; +extern char bots_cflags[]; +extern char bots_ld[]; +extern char bots_ldflags[]; +/* time variables */ +extern double bots_time_program; +extern double bots_time_sequential; + +/* number of tasks variable */ +extern unsigned long long bots_number_of_tasks; /* forcing 8 bytes size on -m32 and -m64 */ + +extern char bots_cutoff[]; +extern int bots_cutoff_value; + +extern int bots_app_cutoff_value; +extern int bots_app_cutoff_value_1; +extern int bots_app_cutoff_value_2; + +extern int bots_arg_size; +extern int bots_arg_size_1; +extern int bots_arg_size_2; + +/* function could be used in app. code but are implemented in bots_common.c */ +long bots_usecs(); +void bots_error(int error, char *message); +void bots_warning(int warning, char *message); + +#define BOTS_RESULT_NA 0 +#define BOTS_RESULT_SUCCESSFUL 1 +#define BOTS_RESULT_UNSUCCESSFUL 2 +#define BOTS_RESULT_NOT_REQUESTED 3 + + +typedef enum { BOTS_VERBOSE_NONE=0, + BOTS_VERBOSE_DEFAULT, + BOTS_VERBOSE_DEBUG } bots_verbose_mode_t; + +extern bots_verbose_mode_t bots_verbose_mode; + +#define bots_message(msg, ...) \ + {\ + if ( bots_verbose_mode >= BOTS_VERBOSE_DEFAULT ) {\ + PRINTC(msg , ##__VA_ARGS__);\ + }\ + } + +#ifdef BOTS_DEBUG +#define bots_debug(msg, ...) \ + {\ + if ( bots_verbose_mode >= BOTS_VERBOSE_DEBUG ) {\ + PRINTC(msg , ##__VA_ARGS__);\ + }\ + } +#define bots_debug_with_location_info(msg, ...) \ + {\ + if ( bots_verbose_mode >= BOTS_VERBOSE_DEBUG ) {\ + PRINTC("%s:%d:%s:" msg ,__FILE__, __LINE__,__func__,##__VA_ARGS__);\ + }\ + } +#else +#define bots_debug(msg, ...) +#define bots_debug_with_location_info(msg, ...) +#endif + +#define FALSE 0 +#define TRUE 1 + +#endif + + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_common.c b/src/components/implementation/no_interface/omp_fib_bots/bots_common.c new file mode 100644 index 0000000000..95d71f172a --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_common.c @@ -0,0 +1,340 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. 
*/ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bots_common.h" +#include "bots_main.h" +#include "bots.h" + +void +bots_error(int error, char *message) +{ + if (message == NULL) + { + switch(error) + { + case BOTS_ERROR: + PRINTC("Error (%d): %s\n",error,"Unspecified error."); + break; + case BOTS_ERROR_NOT_ENOUGH_MEMORY: + PRINTC("Error (%d): %s\n",error,"Not enough memory."); + break; + case BOTS_ERROR_UNRECOGNIZED_PARAMETER: + PRINTC("Error (%d): %s\n",error,"Unrecognized parameter."); + bots_print_usage(); + break; + default: + PRINTC("Error (%d): %s\n",error,"Invalid error code."); + break; + } + } + else PRINTC("Error (%d): %s\n",error,message); + cos_exit(100+error); +} + +void +bots_warning(int warning, char *message) +{ + if (message == NULL) + { + switch(warning) + { + case BOTS_WARNING: + PRINTC("Warning (%d): %s\n",warning,"Unspecified warning."); + break; + default: + PRINTC("Warning (%d): %s\n",warning,"Invalid warning code."); + break; + } + } + else PRINTC("Warning (%d): %s\n",warning,message); +} + +long bots_usecs (void) +{ + struct timeval t; + gettimeofday(&t,NULL); + return t.tv_sec*1000000+t.tv_usec; +} + +void +bots_get_date(char *str) +{ + time_t now; + time(&now); + //strftime(str, 32, "%Y/%m/%d;%H:%M", gmtime(&now)); + strncpy(str, "01/01/0001", 32); +} + +void bots_get_architecture(char *str) +{ + snprintf(str, BOTS_TMP_STR_SZ, "Composite-i386;%d", NUM_CPU); +// int ncpus = sysconf(_SC_NPROCESSORS_CONF); +// struct utsname architecture; +// +// uname(&architecture); +// snprintf(str, BOTS_TMP_STR_SZ, "%s-%s;%d" ,architecture.sysname, architecture.machine, ncpus); +} + +#undef __linux +#if defined (__linux) +/* ****************************************************************** */ +void bots_get_load_average(char *str) +{ + double loadavg[3]; + getloadavg (loadavg, 3); + snprintf(str, BOTS_TMP_STR_SZ, "%.2f;%.2f;%.2f",loadavg[0],loadavg[1],loadavg[2]); +} +#else +/* ****************************************************************** */ +void bots_get_load_average(char *str) { sprintf(str,";;"); } +#endif + +void bots_print_results() +{ + char str_name[BOTS_TMP_STR_SZ]; + char str_parameters[BOTS_TMP_STR_SZ]; + char str_model[BOTS_TMP_STR_SZ]; + char str_resources[BOTS_TMP_STR_SZ]; + char str_result[15]; + char str_time_program[15]; + char str_time_sequential[15]; + char str_speed_up[15]; + char str_number_of_tasks[15]; + char str_number_of_tasks_per_second[15]; + char str_exec_date[BOTS_TMP_STR_SZ]; + char str_exec_message[BOTS_TMP_STR_SZ]; + char str_architecture[BOTS_TMP_STR_SZ]; + char str_load_avg[BOTS_TMP_STR_SZ]; + char str_comp_date[BOTS_TMP_STR_SZ]; + char str_comp_message[BOTS_TMP_STR_SZ]; + char str_cc[BOTS_TMP_STR_SZ]; + char str_cflags[BOTS_TMP_STR_SZ]; + char str_ld[BOTS_TMP_STR_SZ]; + char str_ldflags[BOTS_TMP_STR_SZ]; + char str_cutoff[BOTS_TMP_STR_SZ]; + + /* compute output strings */ + sprintf(str_name, "%s", 
bots_name); + sprintf(str_parameters, "%s", bots_parameters); + sprintf(str_model, "%s", bots_model); + sprintf(str_cutoff, "%s", bots_cutoff); + sprintf(str_resources, "%s", bots_resources); + switch(bots_result) + { + case BOTS_RESULT_NA: + sprintf(str_result, "n/a"); + break; + case BOTS_RESULT_SUCCESSFUL: + sprintf(str_result, "successful"); + break; + case BOTS_RESULT_UNSUCCESSFUL: + sprintf(str_result, "UNSUCCESSFUL"); + break; + case BOTS_RESULT_NOT_REQUESTED: + sprintf(str_result, "Not requested"); + break; + default: + sprintf(str_result, "error"); + break; + } + sprintf(str_time_program, "%f", bots_time_program); + if (bots_sequential_flag) sprintf(str_time_sequential, "%f", bots_time_sequential); + else sprintf(str_time_sequential, "n/a"); + if (bots_sequential_flag) + sprintf(str_speed_up, "%3.2f", bots_time_sequential/bots_time_program); + else sprintf(str_speed_up, "n/a"); + + sprintf(str_number_of_tasks, "%3.2f", (float) bots_number_of_tasks); + sprintf(str_number_of_tasks_per_second, "%3.2f", (float) bots_number_of_tasks/bots_time_program); + + sprintf(str_exec_date, "%s", bots_exec_date); + sprintf(str_exec_message, "%s", bots_exec_message); + bots_get_architecture(str_architecture); + bots_get_load_average(str_load_avg); + sprintf(str_comp_date, "%s", bots_comp_date); + sprintf(str_comp_message, "%s", bots_comp_message); + sprintf(str_cc, "%s", bots_cc); + sprintf(str_cflags, "%s", bots_cflags); + sprintf(str_ld, "%s", bots_ld); + sprintf(str_ldflags, "%s", bots_ldflags); + + if(bots_print_header) + { + switch(bots_output_format) + { + case 0: + break; + case 1: + break; + case 2: +PRINTC( +"Benchmark;Parameters;Model;Cutoff;Resources;Result;\ +Time;Sequential;Speed-up;\ +Nodes;Nodes/Sec;\ +Exec Date;Exec Time;Exec Message;\ +Architecture;Processors;Load Avg-1;Load Avg-5;Load Avg-15;\ +Comp Date;Comp Time;Comp Message;CC;CFLAGS;LD;LDFLAGS\n"); + break; + case 3: + break; + case 4: +PRINTC( +"Benchmark;Parameters;Model;Cutoff;Resources;Result;\ +Time;Sequential;Speed-up;\ +Nodes;Nodes/Sec;\n"); + break; + default: + break; + } + } + + /* print results */ + switch(bots_output_format) + { + case 0: + break; + case 1: + PRINTC("\n"); + PRINTC("Program = %s\n", str_name); /*fix*/ + PRINTC("Parameters = %s\n", str_parameters); /*fix*/ + PRINTC("Model = %s\n", str_model); + PRINTC("Embedded cut-off = %s\n", str_cutoff); + PRINTC("# of Threads = %s\n", str_resources); + PRINTC("Verification = %s\n", str_result); + + PRINTC("Time Program = %s seconds\n", str_time_program); + if (bots_sequential_flag) { + PRINTC("Time Sequential = %s seconds\n", str_time_sequential); + PRINTC("Speed-up = %s\n", str_speed_up); + } + + if ( bots_number_of_tasks > 0 ) { + PRINTC("Nodes = %s\n", str_number_of_tasks); + PRINTC("Nodes/Sec = %s\n", str_number_of_tasks_per_second); + } + + PRINTC("Execution Date = %s\n", str_exec_date); + PRINTC("Execution Message = %s\n", str_exec_message); + + PRINTC("Architecture = %s\n", str_architecture); + PRINTC("Load Avg [1:5:15] = %s\n", str_load_avg); + + PRINTC("Compilation Date = %s\n", str_comp_date); + PRINTC("Compilation Message = %s\n", str_comp_message); + + PRINTC("Compiler = %s\n", str_cc); + PRINTC("Compiler Flags = %s\n", str_cflags); + PRINTC("Linker = %s\n", str_ld); + PRINTC("Linker Flags = %s\n", str_ldflags); + fflush(stdout); + break; + case 2: + PRINTC("%s;%s;%s;%s;%s;%s;", + str_name, + str_parameters, + str_model, + str_cutoff, + str_resources, + str_result + ); + PRINTC("%s;%s;%s;", + str_time_program, + str_time_sequential, + str_speed_up + 
); + PRINTC("%s;%s;", + str_number_of_tasks, + str_number_of_tasks_per_second + ); + PRINTC("%s;%s;", + str_exec_date, + str_exec_message + ); + PRINTC("%s;%s;", + str_architecture, + str_load_avg + ); + PRINTC("%s;%s;", + str_comp_date, + str_comp_message + ); + PRINTC("%s;%s;%s;%s;", + str_cc, + str_cflags, + str_ld, + str_ldflags + ); + PRINTC("\n"); + break; + case 3: + PRINTC("\n"); + PRINTC("Program = %s\n", str_name); /*fix*/ + PRINTC("Parameters = %s\n", str_parameters); /*fix*/ + PRINTC("Model = %s\n", str_model); + PRINTC("Embedded cut-off = %s\n", str_cutoff); + PRINTC("# of Threads = %s\n", str_resources); + PRINTC("Verification = %s\n", str_result); + + PRINTC("Time Program = %s seconds\n", str_time_program); + if (bots_sequential_flag) { + PRINTC("Time Sequential = %s seconds\n", str_time_sequential); + PRINTC("Speed-up = %s\n", str_speed_up); + } + + if ( bots_number_of_tasks > 0 ) { + PRINTC("Nodes = %s\n", str_number_of_tasks); + PRINTC("Nodes/Sec = %s\n", str_number_of_tasks_per_second); + } + break; + case 4: + PRINTC("%s;%s;%s;%s;%s;%s;", + str_name, + str_parameters, + str_model, + str_cutoff, + str_resources, + str_result + ); + PRINTC("%s;%s;%s;", + str_time_program, + str_time_sequential, + str_speed_up + ); + PRINTC("%s;%s;", + str_number_of_tasks, + str_number_of_tasks_per_second + ); + PRINTC("\n"); + break; + default: + bots_error(BOTS_ERROR,"No valid output format\n"); + break; + } +} + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_common.h b/src/components/implementation/no_interface/omp_fib_bots/bots_common.h new file mode 100644 index 0000000000..1e306b7f1d --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_common.h @@ -0,0 +1,56 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#ifndef _COMMON_H +#define _COMMON_H + +#ifndef CC +#define CC "GCC" +#endif +#ifndef CFLAGS +#define CFLAGS "-fopenmp" +#endif +#ifndef LD +#define LD "LD" +#endif +#ifndef LDFLAGS +#define LDFLAGS "-fopenmp -lcos_gomp" +#endif +#ifndef CDATE +#define CDATE "01/01/0001" +#endif +#ifndef CMESSAGE +#define CMESSAGE "Done!" 
+#endif + +#define BOTS_ERROR 0 +#define BOTS_ERROR_NOT_ENOUGH_MEMORY 1 +#define BOTS_ERROR_UNRECOGNIZED_PARAMETER 2 + +#define BOTS_WARNING 0 + +void bots_get_date(char *str); +void bots_get_architecture(char *str); +void bots_get_load_average(char *str); +void bots_print_results(void); + +#define BOTS_TMP_STR_SZ 256 + +#endif diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_main.c b/src/components/implementation/no_interface/omp_fib_bots/bots_main.c new file mode 100644 index 0000000000..2c168be403 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_main.c @@ -0,0 +1,538 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +/*********************************************************************** + * main function & common behaviour of the benchmark. 
+ **********************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bots_common.h" +#include "bots_main.h" +#include "bots.h" +#include "app-desc.h" +#include + +/*********************************************************************** + * DEFAULT VALUES + *********************************************************************/ +/* common flags */ +int bots_sequential_flag = FALSE; +int bots_check_flag = FALSE; +bots_verbose_mode_t bots_verbose_mode = BOTS_VERBOSE_DEFAULT; +int bots_result = BOTS_RESULT_NOT_REQUESTED; +int bots_output_format = 1; +int bots_print_header = FALSE; +/* common variables */ +char bots_name[BOTS_TMP_STR_SZ]; +char bots_execname[BOTS_TMP_STR_SZ]; +char bots_parameters[BOTS_TMP_STR_SZ]; +char bots_model[BOTS_TMP_STR_SZ]; +char bots_resources[BOTS_TMP_STR_SZ]; +/* compile and execution information */ +char bots_exec_date[BOTS_TMP_STR_SZ]; +char bots_exec_message[BOTS_TMP_STR_SZ]; +char bots_comp_date[BOTS_TMP_STR_SZ]; +char bots_comp_message[BOTS_TMP_STR_SZ]; +char bots_cc[BOTS_TMP_STR_SZ]; +char bots_cflags[BOTS_TMP_STR_SZ]; +char bots_ld[BOTS_TMP_STR_SZ]; +char bots_ldflags[BOTS_TMP_STR_SZ]; +char bots_cutoff[BOTS_TMP_STR_SZ]; + +/* time variables */ +double bots_time_program = 0.0; +double bots_time_sequential = 0.0; +unsigned long long bots_number_of_tasks = 0; /* forcing 8 bytes size in -m32 and -m64 */ + +/* + * Application dependent info + */ + +#ifndef BOTS_APP_NAME +#error "Application name must be defined (#define BOTS_APP_NAME)" +#endif + +#ifndef BOTS_APP_PARAMETERS_DESC +#define BOTS_APP_PARAMETERS_DESC "" +#endif + +#ifndef BOTS_APP_PARAMETERS_LIST +#define BOTS_APP_PARAMETERS_LIST +#endif + +#ifndef BOTS_APP_INIT +#define BOTS_APP_INIT +#endif + +#ifndef BOTS_APP_FINI +#define BOTS_APP_FINI +#endif + +#ifndef KERNEL_CALL +#error "Initial kernell call must be specified (#define KERNEL_CALL)" +#endif + +#ifndef KERNEL_INIT +#define KERNEL_INIT +#endif + +#ifndef KERNEL_FINI +#define KERNEL_FINI +#endif + +#ifndef KERNEL_SEQ_INIT +#define KERNEL_SEQ_INIT +#endif + +#ifndef KERNEL_SEQ_FINI +#define KERNEL_SEQ_FINI +#endif + +#ifndef BOTS_MODEL_DESC +#define BOTS_MODEL_DESC "Unknown" +#endif + +#ifdef BOTS_APP_USES_ARG_SIZE +#ifndef BOTS_APP_DEF_ARG_SIZE +#error "Default vaule for argument size must be specified (#define BOTS_APP_DEF_ARG_SIZE)" +#endif +#ifndef BOTS_APP_DESC_ARG_SIZE +#error "Help description for argument size must be specified (#define BOTS_APP_DESC_ARG_SIZE)" +#endif +int bots_arg_size = BOTS_APP_DEF_ARG_SIZE; +#endif + +#ifdef BOTS_APP_USES_ARG_SIZE_1 +#ifndef BOTS_APP_DEF_ARG_SIZE_1 +#error "Default vaule for argument size must be specified (#define BOTS_APP_DEF_ARG_SIZE_1)" +#endif +#ifndef BOTS_APP_DESC_ARG_SIZE_1 +#error "Help description for argument size must be specified (#define BOTS_APP_DESC_ARG_SIZE_1)" +#endif +int bots_arg_size_1 = BOTS_APP_DEF_ARG_SIZE_1; +#endif + +#ifdef BOTS_APP_USES_ARG_SIZE_2 +#ifndef BOTS_APP_DEF_ARG_SIZE_2 +#error "Default vaule for argument size must be specified (#define BOTS_APP_DEF_ARG_SIZE_2)" +#endif +#ifndef BOTS_APP_DESC_ARG_SIZE_2 +#error "Help description for argument size must be specified (#define BOTS_APP_DESC_ARG_SIZE_2)" +#endif +int bots_arg_size_2 = BOTS_APP_DEF_ARG_SIZE_2; +#endif + +#ifdef BOTS_APP_USES_ARG_REPETITIONS +#ifndef BOTS_APP_DEF_ARG_REPETITIONS +#error "Default vaule for argument repetitions must be specified (#define BOTS_APP_DEF_ARG_REPETITIONS)" +#endif +#ifndef 
BOTS_APP_DESC_ARG_REPETITIONS +#error "Help description for argument repetitions must be specified (#define BOTS_APP_DESC_ARG_REPETITIONS)" +#endif +int bots_arg_repetitions = BOTS_APP_DEF_ARG_REPETITIONS; +#endif + +#ifdef BOTS_APP_USES_ARG_FILE +#ifndef BOTS_APP_DESC_ARG_FILE +#error "Help description for argument file must be specified (#define BOTS_APP_DESC_ARG_FILE)" +#endif +char bots_arg_file[255]=""; +#endif + +#ifdef BOTS_APP_USES_ARG_BLOCK +#ifndef BOTS_APP_DEF_ARG_BLOCK +#error "Default value for argument block must be specified (#define BOTS_APP_DEF_ARG_BLOCK)" +#endif +#ifndef BOTS_APP_DESC_ARG_BLOCK +#error "Help description for argument block must be specified (#define BOTS_APP_DESC_ARG_BLOCK)" +#endif +int bots_arg_block = BOTS_APP_DEF_ARG_BLOCK; +#endif + +#ifdef BOTS_APP_USES_ARG_CUTOFF +#ifndef BOTS_APP_DEF_ARG_CUTOFF +#error "Default value for argument cutoff must be specified (#define BOTS_APP_DEF_ARG_CUTOFF)" +#endif +#ifndef BOTS_APP_DESC_ARG_CUTOFF +#error "Help description for argument cutoff must be specified (#define BOTS_APP_DESC_ARG_CUTOFF)" +#endif +int bots_app_cutoff_value = BOTS_APP_DEF_ARG_CUTOFF; +#endif + +#ifdef BOTS_APP_USES_ARG_CUTOFF_1 +#ifndef BOTS_APP_DEF_ARG_CUTOFF_1 +#error "Default value for argument cutoff must be specified (#define BOTS_APP_DEF_ARG_CUTOFF_1)" +#endif +#ifndef BOTS_APP_DESC_ARG_CUTOFF_1 +#error "Help description for argument cutoff must be specified (#define BOTS_APP_DESC_ARG_CUTOFF_1)" +#endif +int bots_app_cutoff_value_1 = BOTS_APP_DEF_ARG_CUTOFF_1; +#endif + +#ifdef BOTS_APP_USES_ARG_CUTOFF_2 +#ifndef BOTS_APP_DEF_ARG_CUTOFF_2 +#error "Default value for argument cutoff must be specified (#define BOTS_APP_DEF_ARG_CUTOFF_2)" +#endif +#ifndef BOTS_APP_DESC_ARG_CUTOFF_2 +#error "Help description for argument cutoff must be specified (#define BOTS_APP_DESC_ARG_CUTOFF_2)" +#endif +int bots_app_cutoff_value_2 = BOTS_APP_DEF_ARG_CUTOFF_2; +#endif + +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) +int bots_cutoff_value = BOTS_CUTOFF_DEF_VALUE; +#endif + +/*********************************************************************** + * print_usage: + **********************************************************************/ +void bots_print_usage() +{ + PRINTC("\n"); + PRINTC("Usage: %s -[options]\n", bots_execname); + PRINTC("\n"); + PRINTC("Where options are:\n"); +#ifdef BOTS_APP_USES_REPETITIONS + PRINTC(" -r : Set the number of repetitions (default = 1).\n"); +#endif +#ifdef BOTS_APP_USES_ARG_SIZE + PRINTC(" -n : "BOTS_APP_DESC_ARG_SIZE"\n"); +#endif +#ifdef BOTS_APP_USES_ARG_SIZE_1 + PRINTC(" -m : "BOTS_APP_DESC_ARG_SIZE_1"\n"); +#endif +#ifdef BOTS_APP_USES_ARG_SIZE_2 + PRINTC(" -l : "BOTS_APP_DESC_ARG_SIZE_2"\n"); +#endif +#ifdef BOTS_APP_USES_ARG_FILE + PRINTC(" -f : "BOTS_APP_DESC_ARG_FILE"\n"); +#endif +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) + PRINTC(" -x : OpenMP tasks cut-off value (default=%d)\n",BOTS_CUTOFF_DEF_VALUE); +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF + PRINTC(" -y : "BOTS_APP_DESC_ARG_CUTOFF"(default=%d)\n", BOTS_APP_DEF_ARG_CUTOFF); +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF_1 + PRINTC(" -a : "BOTS_APP_DESC_ARG_CUTOFF_1"(default=%d)\n", BOTS_APP_DEF_ARG_CUTOFF_1); +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF_2 + PRINTC(" -b : "BOTS_APP_DESC_ARG_CUTOFF_2"(default=%d)\n", BOTS_APP_DEF_ARG_CUTOFF_2); +#endif + + PRINTC("\n"); + PRINTC(" -e : Include 'str' execution message.\n"); + PRINTC(" -v : Set verbose level (default = 1).\n"); + PRINTC(" 0 - none.\n"); + 
PRINTC(" 1 - default.\n"); + PRINTC(" 2 - debug.\n"); + PRINTC(" -o : Set output format mode (default = 1).\n"); + PRINTC(" 0 - no benchmark output.\n"); + PRINTC(" 1 - detailed list format.\n"); + PRINTC(" 2 - detailed row format.\n"); + PRINTC(" 3 - abridged list format.\n"); + PRINTC(" 4 - abridged row format.\n"); + PRINTC(" -z : Print row header (if output format is a row variant).\n"); + PRINTC("\n"); +#ifdef KERNEL_SEQ_CALL + PRINTC(" -s : Run sequential version.\n"); +#endif +#ifdef BOTS_APP_CHECK_USES_SEQ_RESULT + PRINTC(" -c : Check mode ON (implies running sequential version).\n"); +#else + PRINTC(" -c : Check mode ON.\n"); +#endif + PRINTC("\n"); + PRINTC(" -h : Print program's usage (this help).\n"); + PRINTC("\n"); +} +/*********************************************************************** + * bots_get_params_common: + **********************************************************************/ +void +bots_get_params_common(int argc, char **argv) +{ + int i; + strcpy(bots_execname, basename(argv[0])); + bots_get_date(bots_exec_date); + strcpy(bots_exec_message,""); + for (i=1; i 1 ) { + PRINTC("Error: Configure the suite using '--debug' option in order to use a verbose level greather than 1.\n"); + cos_exit(100); + } +#endif + break; +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) + case 'x': + argv[i][1] = '*'; + i++; + if (argc == i) { bots_print_usage(); cos_exit(100); } + bots_cutoff_value = atoi(argv[i]); + break; +#endif +#ifdef BOTS_APP_USES_ARG_CUTOFF + case 'y': + argv[i][1] = '*'; + i++; + if (argc == i) { bots_print_usage(); cos_exit(100); } + bots_app_cutoff_value = atoi(argv[i]); + break; +#endif + case 'z': + argv[i][1] = '*'; + bots_print_header = TRUE; + break; + default: + // As at the moment there are only common paramenters + // we launch an error. Otherwise we have to ignore the + // parameter and to check, after specific parameters are + // completely read, if there are unrecognized parameters. + PRINTC("Error: Unrecognized parameter.\n"); + bots_print_usage(); + cos_exit (100); + } + } + else + { + // As at the moment there are only common paramenters + // we launch an error. Otherwise we have to ignore the + // parameter and to check, after specific parameters are + // completely read, if there are unrecognized parameters. 
+ PRINTC("Error: Unrecognized parameter.\n"); + bots_print_usage(); + cos_exit (100); + } + } +} +/*********************************************************************** + * bots_get_params_common: + **********************************************************************/ +void +bots_get_params(int argc, char **argv) +{ + bots_get_params_common(argc, argv); +// bots_get_params_specific(argc, argv); +} + + +/*********************************************************************** + * bots_set_info + **********************************************************************/ +void bots_set_info () +{ + /* program specific info */ + snprintf(bots_name, BOTS_TMP_STR_SZ, BOTS_APP_NAME); + snprintf(bots_parameters, BOTS_TMP_STR_SZ, BOTS_APP_PARAMETERS_DESC BOTS_APP_PARAMETERS_LIST); + snprintf(bots_model, BOTS_TMP_STR_SZ, BOTS_MODEL_DESC); + snprintf(bots_resources, BOTS_TMP_STR_SZ, "%d", omp_get_max_threads()); + + /* compilation info (do not modify) */ + snprintf(bots_comp_date, BOTS_TMP_STR_SZ, CDATE); + snprintf(bots_comp_message, BOTS_TMP_STR_SZ, CMESSAGE); + snprintf(bots_cc, BOTS_TMP_STR_SZ, CC); + snprintf(bots_cflags, BOTS_TMP_STR_SZ, CFLAGS); + snprintf(bots_ld, BOTS_TMP_STR_SZ, LD); + snprintf(bots_ldflags, BOTS_TMP_STR_SZ, LDFLAGS); + +#if defined(MANUAL_CUTOFF) + snprintf(bots_cutoff, BOTS_TMP_STR_SZ, "manual (%d)",bots_cutoff_value); +#elif defined(IF_CUTOFF) + snprintf(bots_cutoff, BOTS_TMP_STR_SZ, "pragma-if (%d)",bots_cutoff_value); +#elif defined(FINAL_CUTOFF) + snprintf(bots_cutoff, BOTS_TMP_STR_SZ, "final (%d)",bots_cutoff_value); +#else + strcpy(bots_cutoff,"none"); +#endif +} + +/*********************************************************************** + * main: + **********************************************************************/ +int +main(void) +{ + /* TODO: app specific args? 
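the Composite port passes a fixed argv ("bots_app" only), so the run uses the compiled-in defaults such as BOTS_APP_DEF_ARG_SIZE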
*/ + int argc = 1; + char *app = "bots_app"; + char **argv = &app; + +#ifndef BOTS_APP_SELF_TIMING + long bots_t_start; + long bots_t_end; +#endif + + bots_get_params(argc,argv); + BOTS_APP_INIT; + bots_set_info(); + +#ifdef KERNEL_SEQ_CALL +#ifdef BOTS_APP_CHECK_USES_SEQ_RESULT + if (bots_sequential_flag || bots_check_flag) +#else + if (bots_sequential_flag) +#endif + { + bots_sequential_flag = 1; + KERNEL_SEQ_INIT; +#ifdef BOTS_APP_SELF_TIMING + bots_time_sequential = KERNEL_SEQ_CALL; +#else + bots_t_start = bots_usecs(); + KERNEL_SEQ_CALL; + bots_t_end = bots_usecs(); + bots_time_sequential = ((double)(bots_t_end-bots_t_start))/1000000; +#endif + KERNEL_SEQ_FINI; + } +#endif + + KERNEL_INIT; +#ifdef BOTS_APP_SELF_TIMING + bots_time_program = KERNEL_CALL; +#else + bots_t_start = bots_usecs(); + KERNEL_CALL; + bots_t_end = bots_usecs(); + bots_time_program = ((double)(bots_t_end-bots_t_start))/1000000; +#endif + KERNEL_FINI; + +#ifdef KERNEL_CHECK + if (bots_check_flag) { + bots_result = KERNEL_CHECK; + } +#endif + + BOTS_APP_FINI; + + bots_print_results(); + return (0); +} + diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_main.h b/src/components/implementation/no_interface/omp_fib_bots/bots_main.h new file mode 100644 index 0000000000..8d1a9ca9a6 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_main.h @@ -0,0 +1,53 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#define BOTS_PARAM_TYPE_NONE 0 +#define BOTS_PARAM_TYPE_INT 1 +#define BOTS_PARAM_TYPE_BOOL 2 +#define BOTS_PARAM_TYPE_STR 3 + +#ifdef _OPENMP +# include +#else +# define omp_get_max_threads() 1 +# define omp_get_thread_num() 0 +# define omp_set_num_threads(x) +#endif + +void bots_print_usage(void); +void bots_print_usage_option(char opt, int type, char* description, char *val, int subc, char **subv); + +/*********************************************************************** + * BENCHMARK HEADERS + *********************************************************************/ +void bots_initialize(); +void bots_finalize(); +void bots_sequential_ini(); +long bots_sequential(); +void bots_sequential_fini(); +int bots_check_result(); +void bots_print_usage_specific(); +void bots_get_params_specific(int argc, char **argv); +void bots_set_info(); + +void bots_get_params_common(int argc, char **argv); +void bots_get_params(int argc, char **argv); + +extern void cos_exit(int x); diff --git a/src/components/implementation/no_interface/omp_fib_bots/fib.c b/src/components/implementation/no_interface/omp_fib_bots/fib.c new file mode 100644 index 0000000000..445b1b40d5 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/fib.c @@ -0,0 +1,235 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "bots.h" +#include "fib.h" + +#define FIB_RESULTS_PRE 41 +long long fib_results[FIB_RESULTS_PRE] = {0,1,1,2,3,5,8,13,21,34,55,89,144,233,377,610,987,1597,2584,4181,6765,10946,17711,28657,46368,75025,121393,196418,317811,514229,832040,1346269,2178309,3524578,5702887,9227465,14930352,24157817,39088169,63245986,102334155}; + +long long fib_seq (int n) +{ + int x, y; + if (n < 2) return n; + + x = fib_seq(n - 1); + y = fib_seq(n - 2); + + return x + y; +} + +#if defined(FORCE_TIED_TASKS) +#if defined(IF_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task shared(x) firstprivate(n) if(d < bots_cutoff_value) + x = fib(n - 1,d+1); + + #pragma omp task shared(y) firstprivate(n) if(d < bots_cutoff_value) + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(FINAL_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task shared(x) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + x = fib(n - 1,d+1); + + #pragma omp task shared(y) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(MANUAL_CUTOFF) + +long long fib (int n, int d) +{ + long long x, y; + if (n < 2) return n; + + if ( d < bots_cutoff_value ) { + #pragma omp task shared(x) firstprivate(n) + x = fib(n - 1,d+1); + + #pragma omp task shared(y) firstprivate(n) + y = fib(n - 2,d+1); + + #pragma omp taskwait + } else { + x = fib_seq(n-1); + y = fib_seq(n-2); + } + + return x + y; +} + +#else + +long long fib (int n) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task shared(x) firstprivate(n) + x = fib(n - 1); + #pragma omp task shared(y) firstprivate(n) + y = fib(n - 2); + + #pragma omp taskwait + return x + y; +} + +#endif +#else + +#if defined(IF_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) if(d < bots_cutoff_value) + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) if(d < bots_cutoff_value) + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(FINAL_CUTOFF) + +long long fib (int n,int d) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) final(d+1 >= bots_cutoff_value) mergeable + y = fib(n - 2,d+1); + + #pragma omp taskwait + return x + y; +} + +#elif defined(MANUAL_CUTOFF) + +long long fib (int n, int d) +{ + long long x, y; + if (n < 2) return n; + + if ( d < bots_cutoff_value ) { + #pragma omp task untied shared(x) firstprivate(n) + x = fib(n - 1,d+1); + + #pragma omp task untied shared(y) firstprivate(n) + y = fib(n - 2,d+1); + + #pragma omp taskwait + } else { + x = fib_seq(n-1); + y = fib_seq(n-2); + } + + return x + y; +} + +#else + +long long fib (int n) +{ + long long x, y; + if (n < 2) return n; + + #pragma omp task untied shared(x) firstprivate(n) + x = fib(n - 1); + #pragma omp task untied shared(y) firstprivate(n) + y = fib(n - 2); + + #pragma omp 
taskwait + return x + y; +} + +#endif +#endif + +static long long par_res, seq_res; + +void fib0 (int n) +{ + #pragma omp parallel + #pragma omp single +#if defined(MANUAL_CUTOFF) || defined(IF_CUTOFF) || defined(FINAL_CUTOFF) + par_res = fib(n,0); +#else + par_res = fib(n); +#endif + bots_message("Fibonacci result for %d is %lld\n",n,par_res); +} + +void fib0_seq (int n) +{ + seq_res = fib_seq(n); + bots_message("Fibonacci result for %d is %lld\n",n,seq_res); +} + +long long fib_verify_value(int n) +{ + if (n < FIB_RESULTS_PRE) return fib_results[n]; + return ( fib_verify_value(n-1) + fib_verify_value(n-2)); +} + +int fib_verify (int n) +{ + int result; + + if (bots_sequential_flag) + { + if (par_res == seq_res) result = BOTS_RESULT_SUCCESSFUL; + else result = BOTS_RESULT_UNSUCCESSFUL; + } + else + { + seq_res = fib_verify_value(n); + if (par_res == seq_res) result = BOTS_RESULT_SUCCESSFUL; + else result = BOTS_RESULT_UNSUCCESSFUL; + } + + return result; +} + diff --git a/src/components/implementation/no_interface/omp_fib_bots/fib.h b/src/components/implementation/no_interface/omp_fib_bots/fib.h new file mode 100644 index 0000000000..e3d2983e7c --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/fib.h @@ -0,0 +1,40 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ +#ifndef FIB_H +#define FIB_H +#if defined(IF_CUTOFF) +long long fib (int n,int d); +#elif defined(FINAL_CUTOFF) +long long fib (int n,int d); +#elif defined(MANUAL_CUTOFF) +long long fib (int n,int d); +#else +long long fib (int n); +#endif + +long long fib_seq (int n); + +void fib0 (int n); +void fib0_seq (int n); + +int fib_verify (int n); +long long fib_verify_value(int n); +#endif + diff --git a/src/components/implementation/no_interface/omp_fib_bots/init.c b/src/components/implementation/no_interface/omp_fib_bots/init.c new file mode 120000 index 0000000000..b2694bf833 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/init.c @@ -0,0 +1 @@ +../omp_hello/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fib_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_fib_bots/omp-tasks-app.h new file mode 100644 index 0000000000..9cbc9282b2 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/omp-tasks-app.h @@ -0,0 +1,31 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include + +#define MODEL OMP-TASKS + +#ifdef FORCE_TIED_TASKS +#define BOTS_MODEL_DESC "OpenMP (using tied tasks)" +#else +#define BOTS_MODEL_DESC "OpenMP (using tasks)" +#endif + + diff --git a/src/components/implementation/no_interface/omp_fib_bots/posix_basic.c b/src/components/implementation/no_interface/omp_fib_bots/posix_basic.c new file mode 120000 index 0000000000..99b9e18548 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fib_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_dijkstra/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_hello/init.c b/src/components/implementation/no_interface/omp_hello/init.c index 6973648098..00924aac1e 100644 --- a/src/components/implementation/no_interface/omp_hello/init.c +++ b/src/components/implementation/no_interface/omp_hello/init.c @@ -7,6 +7,13 @@ int main(void); +void +cos_exit(int x) +{ + PRINTC("Exit code: %d\n", x); + while (1) ; +} + static void cos_main(void *d) { @@ -25,7 +32,7 @@ cos_init(void *d) int i; static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; - PRINTC("In OpenMP-based Hello Program!\n"); + PRINTC("In an OpenMP program!\n"); if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init(); diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index c7cd84a532..d917541c88 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -223,8 +223,8 @@ cos_init(void) cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); - //test_yield_perf(); - test_yields(); + test_yield_perf(); + //test_yields(); //test_blocking_directed_yield(); //test_timeout_wakeup(); diff --git a/src/components/include/part.h b/src/components/include/part.h index 65d070db4c..49a895a330 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -192,12 +192,13 @@ part_task_end(struct part_task *t) int i; assert(tn == 0); - for (i = 0; i < PART_MAX_CHILD; i++) { - while (ps_load(&t->child[i])) sl_thd_yield(0); - } + part_task_wait_children(t); ps_faa(&t->end, 1); part_task_remove_child(t->parent, t); - ts->part_context = 0; + if (t->type == PART_TASK_T_WORKSHARE) { + assert(t->workers[tn] == t->master); + ts->part_context = t->parent; + } return; } @@ -263,6 +264,14 @@ part_thd_fn(void *d) t->cs.fn(t->cs.data); part_task_end(t); + /* free the explicit task! 
*/ + if (t->type != PART_TASK_T_WORKSHARE) { + struct part_data *d = t->data_env; + + part_task_free(t); + part_data_free(d); + } + curr->part_context = NULL; } sl_thd_exit(); diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index 2249e17eaa..94531a8c5e 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -9,8 +9,10 @@ #define PART_THD(c, t) (cos_cpuid() << 16 | cos_thdid()) #define PART_CURR_THD PART_THD(cos_cpuid(), cos_thdid()) -#define PART_MAX 4 -#define PART_MAX_CORE_THDS 4 +#define PART_MAX_TASKS 1024 +#define PART_MAX_DATA 512 +#define PART_MAX_PAR_THDS 4 +#define PART_MAX_CORE_THDS 64 #define PART_MAX_THDS PART_MAX_CORE_THDS*NUM_CPU #define PART_MAX_CHILD 16 #define PART_MAX_WORKSHARES 16 @@ -20,6 +22,7 @@ typedef void (*part_fn_t)(void *); typedef enum { PART_TASK_S_FREED, PART_TASK_S_ALLOCATED, + PART_TASK_S_INITIALIZED, PART_TASK_S_RUNNING, PART_TASK_S_CHILD_WAIT, /* WAIT FOR CHILD TASKS */ PART_TASK_S_SIBLING_WAIT, /* WAIT FOR SIBLING TASKS */ @@ -58,6 +61,11 @@ struct part_closure { void *data; }; +struct part_data { + int flag; /* 0 = not in use, 1 = in use */ + char data[PART_MAX_DATA]; +}; + struct part_task { part_task_state_t state; part_task_type_t type; @@ -66,20 +74,20 @@ struct part_task { struct part_closure cs; unsigned nthds; /* number of threads for this task, 1 in case of non-workshare work */ - unsigned workers[PART_MAX_THDS]; /* threads sharing this work or thread doing this work! */ - int ws_off[PART_MAX_THDS]; /* progress of the workshares in each participating thread */ + unsigned workers[PART_MAX_PAR_THDS]; /* threads sharing this work or thread doing this work! */ + int ws_off[PART_MAX_PAR_THDS]; /* progress of the workshares in each participating thread */ unsigned master; /* coreid << 16 | thdid of the master */ unsigned barrier_in, barrier_out, end; - /* TODO: parent to wait on all child tasks for taskwait synchronization! 
*/ + struct part_data *data_env; struct part_task *parent; - struct part_task *child[PART_MAX_CHILD]; + int nchildren; struct ps_list partask; } CACHE_ALIGNED; static inline void -part_task_init(struct part_task *t, part_task_type_t type, struct part_task *p, unsigned nthds, part_fn_t fn, void *data) +part_task_init(struct part_task *t, part_task_type_t type, struct part_task *p, unsigned nthds, part_fn_t fn, void *data, struct part_data *d) { int i; @@ -87,20 +95,28 @@ part_task_init(struct part_task *t, part_task_type_t type, struct part_task *p, ps_list_init(t, partask); t->type = type; - t->state = PART_TASK_S_ALLOCATED; + t->state = PART_TASK_S_INITIALIZED; t->parent = p; t->nthds = nthds; + t->nchildren = 0; + t->barrier_in = t->barrier_out = t->end = 0; + t->data_env = d; t->master = PART_CURR_THD; t->cs.fn = fn; t->cs.data = data; - for (i = 0; i < PART_MAX_THDS; i++) t->ws_off[i] = -1; + for (i = 0; i < PART_MAX_PAR_THDS; i++) t->ws_off[i] = -1; /* if it's worksharing, current thread is the master and does take part in the par section */ if (type == PART_TASK_T_WORKSHARE) t->workers[0] = t->master; } +struct part_task *part_task_alloc(part_task_type_t); +void part_task_free(struct part_task *); +struct part_data *part_data_alloc(void); +void part_data_free(struct part_data *); + static inline int part_task_add_child(struct part_task *t, struct part_task *c) { @@ -108,11 +124,10 @@ part_task_add_child(struct part_task *t, struct part_task *c) if (unlikely(!t || !c)) return -1; - for (i = 0; i < PART_MAX_CHILD; i++) { - if (likely(t->child[i] == 0 && ps_cas(&t->child[i], 0, (unsigned long)c))) return i; - } - - return -1; + i = ps_faa(&t->nchildren, 1); + assert(i < PART_MAX_CHILD); + + return i; } static inline void @@ -121,12 +136,18 @@ part_task_remove_child(struct part_task *t, struct part_task *c) int i; if (unlikely(!t || !c)) return; + assert(ps_load(&t->nchildren)); - for (i = 0; i < PART_MAX_CHILD; i++) { - if (t->child[i] != c) continue; + i = ps_faa(&t->nchildren, -1); + assert(i > 0); +} - if (unlikely(!ps_cas(&t->child[i], (unsigned long)c, 0))) assert(0); - } +static inline void +part_task_wait_children(struct part_task *t) +{ + while (ps_load(&t->nchildren) > 0) sl_thd_yield(0); + + assert(t->nchildren == 0); } static inline int @@ -189,9 +210,7 @@ part_task_barrier(struct part_task *t) assert(tn == 0 && t->barrier_in == 0); /* wait for all child tasks to complete, including explicit tasks */ - for (i = 0; i < PART_MAX_CHILD; i++) { - while (ps_load(&t->child[i])) sl_thd_yield(0); - } + part_task_wait_children(t); return; } @@ -204,9 +223,7 @@ part_task_barrier(struct part_task *t) int i; /* wait for all child tasks to complete, including explicit tasks */ - for (i = 0; i < PART_MAX_CHILD; i++) { - while (ps_load(&t->child[i])) sl_thd_yield(0); - } + part_task_wait_children(t); } else { /* wait for all sibling tasks to reach in barrier! */ while (ps_load(&t->barrier_in) % t->nthds != 0) sl_thd_yield(0); diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index 2869d73062..33b763af63 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -15,49 +15,18 @@ #include #include /* for now, single core lock! 
*/ #include +#include <../../interface/capmgr/memmgr.h> #include "cos_gomp.h" #include #include -#define COS_GOMP_MAX_EXPLICIT_TASKS 1024 -#define COS_GOMP_MAX_IMPLICIT_TASKS 512 - -#define COS_GOMP_MAX_ARGS 8 -#define COS_GOMP_MAX_ARG_SZ 64 -#define COS_GOMP_MAX_ARGS_SZ (COS_GOMP_MAX_ARGS * COS_GOMP_MAX_ARG_SZ) - -static struct part_task _itasks[COS_GOMP_MAX_IMPLICIT_TASKS], _etasks[COS_GOMP_MAX_EXPLICIT_TASKS]; -static unsigned _itask_free, _etask_free, _etask_data_free; -static char _etask_data[COS_GOMP_MAX_EXPLICIT_TASKS][COS_GOMP_MAX_ARGS_SZ]; - static struct crt_lock _glock; /* global lock for critical sections */ -static inline struct part_task * -_cos_gomp_alloc_implicit(void) -{ - unsigned i = ps_faa(&_itask_free, 1); - - assert(i < COS_GOMP_MAX_IMPLICIT_TASKS); - return &_itasks[i]; -} - static inline struct part_task * _cos_gomp_alloc_explicit(void) { - unsigned i = ps_faa(&_etask_free, 1); - - assert(i < COS_GOMP_MAX_EXPLICIT_TASKS); - return &_etasks[i]; -} - -static inline char * -_cos_gomp_alloc_data_explicit(void) -{ - unsigned i = ps_faa(&_etask_data_free, 1); - - assert(i < COS_GOMP_MAX_EXPLICIT_TASKS); - return _etask_data[i]; + return part_task_alloc(0); } void @@ -66,11 +35,6 @@ cos_gomp_init(void) static int first_one = NUM_CPU, init_done = 0; if (ps_cas(&first_one, NUM_CPU, cos_cpuid())) { - memset(_itasks, 0, sizeof(struct part_task) * COS_GOMP_MAX_IMPLICIT_TASKS); - memset(_etasks, 0, sizeof(struct part_task) * COS_GOMP_MAX_EXPLICIT_TASKS); - memset(_etask_data, 0, sizeof(char) * COS_GOMP_MAX_EXPLICIT_TASKS * COS_GOMP_MAX_ARGS_SZ); - _itask_free = _etask_free = _etask_data_free = 0; - crt_lock_init(&_glock); cos_omp_init(); init_done = 1; @@ -87,11 +51,12 @@ _gomp_parallel_start(struct part_task *pt, void (*fn) (void *), void *data, unsi struct sl_thd *t = sl_thd_curr(); struct part_task *parent = (struct part_task *)t->part_context; - num_threads = num_threads ? ((num_threads > COS_GOMP_MAX_THDS) ? COS_GOMP_MAX_THDS : num_threads) : PART_MAX; + num_threads = (num_threads == 0 || num_threads > COS_GOMP_MAX_THDS) ? COS_GOMP_MAX_THDS : num_threads; + /* nesting? */ if (unlikely(parent && PART_NESTED == 0)) num_threads = 1; - part_task_init(pt, PART_TASK_T_WORKSHARE, parent, num_threads, fn, data); + part_task_init(pt, PART_TASK_T_WORKSHARE, parent, num_threads, fn, data, NULL); assert(pt->nthds == num_threads); if (unlikely(parent)) { parent_off = part_task_add_child(parent, pt); @@ -330,18 +295,22 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), * tracking before/after execution of the function. */ /* TODO: depend, flags, etc! 
*/ - assert(!depend); + assert(depend == NULL); if (if_clause) { struct part_task *pt = _cos_gomp_alloc_explicit(); - char *arg = _cos_gomp_alloc_data_explicit(); + struct part_data *d = part_data_alloc(); + char *arg = NULL; - assert(arg_size + arg_align - 1 <= COS_GOMP_MAX_ARGS_SZ); + assert(pt && d); + assert(arg_size + arg_align - 1 <= PART_MAX_DATA); + arg = (char *) (((uintptr_t) d->data + arg_align - 1) + & ~(uintptr_t) (arg_align - 1)); if (cpyfn) cpyfn(arg, data); else memcpy(arg, data, arg_size); assert(parent); - part_task_init(pt, 0, parent, 1, fn, arg); + part_task_init(pt, 0, parent, 1, fn, arg, d); parent_off = part_task_add_child(parent, pt); assert(parent_off >= 0); @@ -354,14 +323,25 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), struct part_task pt; assert(parent); - part_task_init(&pt, 0, parent, 1, fn, data); + part_task_init(&pt, 0, parent, 1, fn, data, NULL); parent_off = part_task_add_child(parent, &pt); assert(parent_off >= 0); - - /* TODO: do I still need to make a copy like in libgomp? */ - fn(data); + sl_thd_curr()->part_context = &pt; + pt.workers[0] = PART_CURR_THD; + + if (cpyfn) { + char buf[arg_size + arg_align - 1]; + char *arg = (char *) (((uintptr_t) buf + arg_align - 1) + & ~(uintptr_t) (arg_align - 1)); + + cpyfn(arg, data); + fn(arg); + } else { + fn(data); + } part_task_end(&pt); + sl_thd_curr()->part_context = pt.parent; } } @@ -369,23 +349,7 @@ void GOMP_taskwait (void) { struct part_task *t = sl_thd_curr()->part_context; - int i; - for (i = 0; i < PART_MAX_CHILD; i++) { - struct part_task *ct = t->child[i]; - - if (!ct) continue; - - /* - * TODO: - * Options for explicit tasks: - * 1. Perhaps run that task here if it has not been picked up by any other thread, - * unfortunately we cannot do that with "deque" data-structure! - * 2. Perhaps yield to a free thread that could potentially run that task? - * 3. Just yield (a task scheduling point = a thread scheduling point), - * so other pending work is taken care of before we get to run again! - */ - while (ct) sl_thd_yield(0); - } + part_task_wait_children(t); /* no barriers of course! 
*/ } diff --git a/src/components/lib/cos_gomp/cos_gomp.h b/src/components/lib/cos_gomp/cos_gomp.h index f64de36d88..3cce60a1fe 100644 --- a/src/components/lib/cos_gomp/cos_gomp.h +++ b/src/components/lib/cos_gomp/cos_gomp.h @@ -3,9 +3,9 @@ #include -#define COS_GOMP_MAX_THDS 4 +#define COS_GOMP_MAX_THDS PART_MAX_PAR_THDS #define COS_GOMP_CORE_MAX_THDS PART_MAX_CORE_THDS #define COS_GOMP_MAX_CHILD PART_MAX_CHILD -#define COS_GOMP_MAX_TASKS 4096 +#define COS_GOMP_MAX_TASKS PART_MAX_TASKS #endif /* COS_GOMP_H */ diff --git a/src/components/lib/cos_gomp/cos_omp.c b/src/components/lib/cos_gomp/cos_omp.c index f271311648..c8b7309ae6 100644 --- a/src/components/lib/cos_gomp/cos_omp.c +++ b/src/components/lib/cos_gomp/cos_omp.c @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -40,7 +41,7 @@ omp_get_num_procs(void) __GOMP_NOTHROW int omp_get_max_threads(void) { - return COS_OMP_MAX_NUM_THREADS; + return COS_GOMP_MAX_THDS; } __GOMP_NOTHROW int diff --git a/src/components/lib/part.c b/src/components/lib/part.c index 270677cd79..c3fffe9b38 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -2,20 +2,91 @@ #include #include #include +#include <../interface/capmgr/memmgr.h> #include #include +#define PART_MAX_PAGES ((PART_MAX_TASKS * sizeof(struct part_task)) / PAGE_SIZE) +#define PART_MAX_DATA_PAGES ((PART_MAX_TASKS * sizeof(struct part_data)) / PAGE_SIZE) + struct deque_part part_dq_percore[NUM_CPU]; //struct cirque_par parcq_global; struct ps_list_head part_l_global; static unsigned part_ready = 0; struct crt_lock part_l_lock; +static struct part_task *part_tasks = NULL; +static struct part_data *part__data = NULL; #define PART_DEQUE_SZ 64 #define _PART_PRIO 1 #define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) +struct part_data * +part_data_alloc(void) +{ + int i; + struct part_data *d; + + for (i = 0; i < PART_MAX_TASKS; i++) { + d = part__data + i; + + if (d->flag) continue; + + /* if this fails, someone else just alloced it! */ + if (!ps_cas(&d->flag, 0, 1)) continue; + + return d; + } + + return NULL; +} + +void +part_data_free(struct part_data *d) +{ + int f; + + if (!d) return; + + do { + f = d->flag; + assert(f); + } while (!ps_cas(&d->flag, f, 0)); +} +struct part_task * +part_task_alloc(part_task_type_t type) +{ + int i; + struct part_task *t; + + for (i = 0; i < PART_MAX_TASKS; i++) { + t = part_tasks + i; + + if (t->state != PART_TASK_S_FREED) continue; + + /* if this fails, someone else just alloced it! 
*/ + if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) continue; + + return t; + } + + return NULL; +} + +void +part_task_free(struct part_task *t) +{ + part_task_state_t s = 0; + + if (!t) return; + + do { + s = t->state; + assert(s != PART_TASK_S_FREED); + } while (!ps_cas(&t->state, s, PART_TASK_S_FREED)); +} + unsigned part_isready(void) { return (part_ready == NUM_CPU); } @@ -30,6 +101,13 @@ part_init(void) while (!ps_load(&ds_init_done)) ; } else { for (k = 0; k < NUM_CPU; k++) deque_init_part(&part_dq_percore[k], PART_DEQUE_SZ); + part_tasks = (struct part_task *)memmgr_heap_page_allocn(PART_MAX_PAGES); + assert(part_tasks); + + + part__data = (struct part_data *)memmgr_heap_page_allocn(PART_MAX_DATA_PAGES); + assert(part__data); + ps_list_head_init(&part_l_global); crt_lock_init(&part_l_lock); ps_faa(&ds_init_done, 1); diff --git a/src/kernel/include/shared/consts.h b/src/kernel/include/shared/consts.h index dddbb93a23..ce11fd9152 100644 --- a/src/kernel/include/shared/consts.h +++ b/src/kernel/include/shared/consts.h @@ -48,7 +48,7 @@ struct pt_regs { #endif #define MAX_SERVICE_DEPTH 31 -#define MAX_NUM_THREADS (64 * NUM_CPU) +#define MAX_NUM_THREADS (128 * NUM_CPU) /* Stacks are 2 * page_size (expressed in words) */ #define MAX_STACK_SZ_BYTE_ORDER 12 diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index 8c46ae5377..bf501b3be9 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -17,7 +17,7 @@ #include "cpu_ghz.h" -#define NUM_CPU 2 +#define NUM_CPU 1 #define NUM_CPU_BMP_BYTES ((NUM_CPU + 7) / 8) #define NUM_CPU_BMP_WORDS ((NUM_CPU_BMP_BYTES + 3) / 4) diff --git a/src/platform/i386/runscripts/omp_fib_bots.sh b/src/platform/i386/runscripts/omp_fib_bots.sh new file mode 100644 index 0000000000..22edc6b958 --- /dev/null +++ b/src/platform/i386/runscripts/omp_fib_bots.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp omp_fib_bots.o boot.o +cp test_boot.o dummy1.o +cp test_boot.o dummy2.o +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From 6632a20b0b593252554c1133e3549ffed0e91cc7 Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 13 May 2019 23:39:19 -0400 Subject: [PATCH 064/127] block/wakeup thread pool in part --- src/components/include/part.h | 46 ++++++++++++++++++++++++-- src/components/include/sl_thd.h | 1 + src/components/lib/cos_gomp/cos_gomp.c | 10 +++++- src/components/lib/part.c | 2 ++ src/components/lib/sl/sl_capmgr.c | 1 + src/components/lib/sl/sl_raw.c | 1 + 6 files changed, 58 insertions(+), 3 deletions(-) diff --git a/src/components/include/part.h b/src/components/include/part.h index 49a895a330..ed5af5ae87 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -19,6 +19,7 @@ extern struct deque_part part_dq_percore[]; /* FIXME: use stacklist or another stack like data structure? 
*/ extern struct ps_list_head part_l_global; extern struct crt_lock part_l_lock; +extern struct ps_list_head part_thdpool_core[]; static inline struct deque_part * part_deque_curr(void) @@ -34,6 +35,12 @@ part_deque_core(cpuid_t c) return &part_dq_percore[c]; } +static inline struct ps_list_head * +part_thdpool_curr(void) +{ + return &part_thdpool_core[cos_cpuid()]; +} + //static inline struct cirque_par * //part_cirque(void) //{ @@ -100,6 +107,40 @@ part_deque_steal_any(void) return NULL; } +static inline void +part_pool_wakeup(void) +{ + struct sl_thd *t = NULL; + int i; + + sl_cs_enter(); + if (unlikely(ps_list_head_empty(part_thdpool_curr()))) { + sl_cs_exit(); + return; + } + + t = ps_list_head_first(part_thdpool_curr(), struct sl_thd, partlist); + assert(t != sl_thd_curr()); + ps_list_rem(t, partlist); + sl_cs_exit(); + + sl_thd_wakeup(sl_thd_thdid(t)); +} + +static inline void +part_pool_block(void) +{ + struct sl_thd *t = sl_thd_curr(); + + assert(ps_list_singleton(t, partlist)); + sl_cs_enter(); + + ps_list_head_append(part_thdpool_curr(), t, partlist); + sl_cs_exit(); + + sl_thd_block(0); +} + ///* ds memory in a circular queue */ //static inline struct part_task * //part_cirque_alloc(void) @@ -224,7 +265,7 @@ part_thd_fn(void *d) struct sl_thd *curr = sl_thd_curr(); /* parallel runtime not ready? */ - while (unlikely(!part_isready())) sl_thd_yield(0); + if (unlikely(!part_isready())) part_pool_block(); while (1) { struct part_task *t = NULL; @@ -251,7 +292,8 @@ part_thd_fn(void *d) t = part_deque_steal_any(); if (unlikely(!t)) { - sl_thd_yield(0); + part_pool_block(); + continue; } assert(t->type != PART_TASK_T_WORKSHARE); diff --git a/src/components/include/sl_thd.h b/src/components/include/sl_thd.h index 25bfd572e1..632759087f 100644 --- a/src/components/include/sl_thd.h +++ b/src/components/include/sl_thd.h @@ -93,6 +93,7 @@ struct sl_thd { struct cos_dcb_info *dcb; void *part_context; /* used by the parallelism stuff! 
*/ + struct ps_list partlist; }; static inline struct cos_dcb_info * diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index 33b763af63..4c98bb01fe 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -64,7 +64,13 @@ _gomp_parallel_start(struct part_task *pt, void (*fn) (void *), void *data, unsi } t->part_context = pt; - if (unlikely(num_threads > 1)) part_list_append(pt); + if (unlikely(num_threads > 1)) { + unsigned i; + + part_list_append(pt); + + for (i = 1; i < num_threads; i++) part_pool_wakeup(); + } } static inline void @@ -318,6 +324,8 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), ret = part_deque_push(pt); } while (ret == -EAGAIN); assert(ret == 0); + /* wake up a thread that might potentially run this workload */ + part_pool_wakeup(); } else { /* if_clause is false, task is an included/undeferred task */ struct part_task pt; diff --git a/src/components/lib/part.c b/src/components/lib/part.c index c3fffe9b38..ce04682a7d 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -17,6 +17,7 @@ static unsigned part_ready = 0; struct crt_lock part_l_lock; static struct part_task *part_tasks = NULL; static struct part_data *part__data = NULL; +struct ps_list_head part_thdpool_core[NUM_CPU]; #define PART_DEQUE_SZ 64 #define _PART_PRIO 1 @@ -97,6 +98,7 @@ part_init(void) int k; static int is_first = NUM_CPU, ds_init_done = 0; + ps_list_head_init(&part_thdpool_core[cos_cpuid()]); if (!ps_cas(&is_first, NUM_CPU, cos_cpuid())) { while (!ps_load(&ds_init_done)) ; } else { diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index 346e59c5ba..88c595b34a 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -80,6 +80,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t t->timeout_idx = -1; t->prio = TCAP_PRIO_MIN; ps_list_init(t, SL_THD_EVENT_LIST); + ps_list_init(t, partlist); sl_thd_event_info_reset(t); sl_xcore_thd_lookup_init(aep->tid, cos_cpuid()); diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index 5e5d8ead8e..92f1a0b645 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -73,6 +73,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t t->timeout_idx = -1; t->prio = TCAP_PRIO_MIN; ps_list_init(t, SL_THD_EVENT_LIST); + ps_list_init(t, partlist); sl_thd_event_info_reset(t); done: From 3467e072b2925ed2b83e83f490b47da9bb4d022c Mon Sep 17 00:00:00 2001 From: phani Date: Tue, 14 May 2019 00:25:20 -0400 Subject: [PATCH 065/127] added bots sort, TODO: debug --- .../no_interface/omp_fib_bots/bots_main.c | 3 + .../no_interface/omp_sort_bots/Makefile | 19 + .../no_interface/omp_sort_bots/app-desc.h | 66 +++ .../no_interface/omp_sort_bots/bots.h | 1 + .../no_interface/omp_sort_bots/bots_common.c | 1 + .../no_interface/omp_sort_bots/bots_common.h | 1 + .../no_interface/omp_sort_bots/bots_main.c | 1 + .../no_interface/omp_sort_bots/bots_main.h | 1 + .../no_interface/omp_sort_bots/init.c | 1 + .../omp_sort_bots/omp-tasks-app.h | 1 + .../no_interface/omp_sort_bots/posix_basic.c | 1 + .../no_interface/omp_sort_bots/sort.c | 509 ++++++++++++++++++ src/platform/i386/runscripts/omp_sort_bots.sh | 7 + 13 files changed, 612 insertions(+) create mode 100644 src/components/implementation/no_interface/omp_sort_bots/Makefile create mode 100644 
src/components/implementation/no_interface/omp_sort_bots/app-desc.h create mode 120000 src/components/implementation/no_interface/omp_sort_bots/bots.h create mode 120000 src/components/implementation/no_interface/omp_sort_bots/bots_common.c create mode 120000 src/components/implementation/no_interface/omp_sort_bots/bots_common.h create mode 120000 src/components/implementation/no_interface/omp_sort_bots/bots_main.c create mode 120000 src/components/implementation/no_interface/omp_sort_bots/bots_main.h create mode 120000 src/components/implementation/no_interface/omp_sort_bots/init.c create mode 120000 src/components/implementation/no_interface/omp_sort_bots/omp-tasks-app.h create mode 120000 src/components/implementation/no_interface/omp_sort_bots/posix_basic.c create mode 100644 src/components/implementation/no_interface/omp_sort_bots/sort.c create mode 100644 src/platform/i386/runscripts/omp_sort_bots.sh diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_main.c b/src/components/implementation/no_interface/omp_fib_bots/bots_main.c index 2c168be403..53b478512a 100644 --- a/src/components/implementation/no_interface/omp_fib_bots/bots_main.c +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_main.c @@ -430,6 +430,9 @@ bots_get_params_common(int argc, char **argv) cos_exit (100); } } + + /* always verify? */ + bots_check_flag = TRUE; } /*********************************************************************** * bots_get_params_common: diff --git a/src/components/implementation/no_interface/omp_sort_bots/Makefile b/src/components/implementation/no_interface/omp_sort_bots/Makefile new file mode 100644 index 0000000000..099883c6e0 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_sort_bots.o +INTERFACES= +DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_sort_bots/app-desc.h b/src/components/implementation/no_interface/omp_sort_bots/app-desc.h new file mode 100644 index 0000000000..85e6e47782 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/app-desc.h @@ -0,0 +1,66 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" + +#define BOTS_APP_NAME "Sort" +#define BOTS_APP_PARAMETERS_DESC "N=%d:Q=%d:I=%d:M=%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size,bots_app_cutoff_value_1,bots_app_cutoff_value_2,bots_app_cutoff_value + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE (32*1024*1024) +#define BOTS_APP_DESC_ARG_SIZE "Array size" + +#define BOTS_APP_USES_ARG_CUTOFF +#define BOTS_APP_DEF_ARG_CUTOFF (2*1024) +#define BOTS_APP_DESC_ARG_CUTOFF "Sequential Merge cutoff value" + +#define BOTS_APP_USES_ARG_CUTOFF_1 +#define BOTS_APP_DEF_ARG_CUTOFF_1 (2*1024) +#define BOTS_APP_DESC_ARG_CUTOFF_1 "Sequential Quicksort cutoff value" + +#define BOTS_APP_USES_ARG_CUTOFF_2 +#define BOTS_APP_DEF_ARG_CUTOFF_2 (20) +#define BOTS_APP_DESC_ARG_CUTOFF_2 "Sequential Insertion cutoff value" + +typedef long ELM; + +void seqquick(ELM *low, ELM *high); +void seqmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest); +ELM *binsplit(ELM val, ELM *low, ELM *high); +void cilkmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest); +void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest); +void cilksort(ELM *low, ELM *tmp, long size); +void cilksort_par(ELM *low, ELM *tmp, long size); +void scramble_array( ELM *array ); +void fill_array( ELM *array ); +void sort ( void ); + +void sort_par (void); +void sort_init (void); +int sort_verify (void); + +#define BOTS_APP_INIT sort_init() + +#define KERNEL_INIT +#define KERNEL_CALL sort_par() +#define KERNEL_CHECK sort_verify() + + diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots.h b/src/components/implementation/no_interface/omp_sort_bots/bots.h new file mode 120000 index 0000000000..ea0ad2b59f --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots.h @@ -0,0 +1 @@ +../omp_fib_bots/bots.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_common.c b/src/components/implementation/no_interface/omp_sort_bots/bots_common.c new file mode 120000 index 0000000000..4802b0cf70 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_common.c @@ -0,0 +1 @@ +../omp_fib_bots/bots_common.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_common.h b/src/components/implementation/no_interface/omp_sort_bots/bots_common.h new file mode 120000 index 0000000000..14eda863e4 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_common.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_common.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_main.c b/src/components/implementation/no_interface/omp_sort_bots/bots_main.c new file mode 120000 index 0000000000..14f2dab009 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/bots_main.c @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/bots_main.h b/src/components/implementation/no_interface/omp_sort_bots/bots_main.h new file mode 120000 index 0000000000..86c06ad286 --- /dev/null +++ 
b/src/components/implementation/no_interface/omp_sort_bots/bots_main.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/init.c b/src/components/implementation/no_interface/omp_sort_bots/init.c new file mode 120000 index 0000000000..9e09b82e77 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/init.c @@ -0,0 +1 @@ +../omp_fib_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_sort_bots/omp-tasks-app.h new file mode 120000 index 0000000000..9fba574408 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/omp-tasks-app.h @@ -0,0 +1 @@ +../omp_fib_bots/omp-tasks-app.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/posix_basic.c b/src/components/implementation/no_interface/omp_sort_bots/posix_basic.c new file mode 120000 index 0000000000..9afee078fb --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_fib_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sort_bots/sort.c b/src/components/implementation/no_interface/omp_sort_bots/sort.c new file mode 100644 index 0000000000..e8347e4ff9 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sort_bots/sort.c @@ -0,0 +1,509 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +/* + * Original code from the Cilk project + * + * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (c) 2000 Matteo Frigo + */ + +/* + * this program uses an algorithm that we call `cilksort'. + * The algorithm is essentially mergesort: + * + * cilksort(in[1..n]) = + * spawn cilksort(in[1..n/2], tmp[1..n/2]) + * spawn cilksort(in[n/2..n], tmp[n/2..n]) + * sync + * spawn cilkmerge(tmp[1..n/2], tmp[n/2..n], in[1..n]) + * + * + * The procedure cilkmerge does the following: + * + * cilkmerge(A[1..n], B[1..m], C[1..(n+m)]) = + * find the median of A \union B using binary + * search. The binary search gives a pair + * (ma, mb) such that ma + mb = (n + m)/2 + * and all elements in A[1..ma] are smaller than + * B[mb..m], and all the B[1..mb] are smaller + * than all elements in A[ma..n]. 
+ * + * spawn cilkmerge(A[1..ma], B[1..mb], C[1..(n+m)/2]) + * spawn cilkmerge(A[ma..m], B[mb..n], C[(n+m)/2 .. (n+m)]) + * sync + * + * The algorithm appears for the first time (AFAIK) in S. G. Akl and + * N. Santoro, "Optimal Parallel Merging and Sorting Without Memory + * Conflicts", IEEE Trans. Comp., Vol. C-36 No. 11, Nov. 1987 . The + * paper does not express the algorithm using recursion, but the + * idea of finding the median is there. + * + * For cilksort of n elements, T_1 = O(n log n) and + * T_\infty = O(log^3 n). There is a way to shave a + * log factor in the critical path (left as homework). + */ + +#include +#include +#include +#include "bots.h" +#include "app-desc.h" + +ELM *array, *tmp; + +static unsigned long rand_nxt = 0; + +static inline unsigned long my_rand(void) +{ + rand_nxt = rand_nxt * 1103515245 + 12345; + return rand_nxt; +} + +static inline void my_srand(unsigned long seed) +{ + rand_nxt = seed; +} + +static inline ELM med3(ELM a, ELM b, ELM c) +{ + if (a < b) { + if (b < c) { + return b; + } else { + if (a < c) + return c; + else + return a; + } + } else { + if (b > c) { + return b; + } else { + if (a > c) + return c; + else + return a; + } + } +} + +/* + * simple approach for now; a better median-finding + * may be preferable + */ +static inline ELM choose_pivot(ELM *low, ELM *high) +{ + return med3(*low, *high, low[(high - low) / 2]); +} + +static ELM *seqpart(ELM *low, ELM *high) +{ + ELM pivot; + ELM h, l; + ELM *curr_low = low; + ELM *curr_high = high; + + pivot = choose_pivot(low, high); + + while (1) { + while ((h = *curr_high) > pivot) + curr_high--; + + while ((l = *curr_low) < pivot) + curr_low++; + + if (curr_low >= curr_high) + break; + + *curr_high-- = l; + *curr_low++ = h; + } + + /* + * I don't know if this is really necessary. + * The problem is that the pivot is not always the + * first element, and the partition may be trivial. + * However, if the partition is trivial, then + * *high is the largest element, whence the following + * code. + */ + if (curr_high < high) + return curr_high; + else + return curr_high - 1; +} + +#define swap(a, b) \ +{ \ + ELM tmp;\ + tmp = a;\ + a = b;\ + b = tmp;\ +} + +static void insertion_sort(ELM *low, ELM *high) +{ + ELM *p, *q; + ELM a, b; + + for (q = low + 1; q <= high; ++q) { + a = q[0]; + for (p = q - 1; p >= low && (b = p[0]) > a; p--) + p[1] = b; + p[1] = a; + } +} + +/* + * tail-recursive quicksort, almost unrecognizable :-) + */ +void seqquick(ELM *low, ELM *high) +{ + ELM *p; + + while (high - low >= bots_app_cutoff_value_2) { + p = seqpart(low, high); + seqquick(low, p); + low = p + 1; + } + + insertion_sort(low, high); +} + +void seqmerge(ELM *low1, ELM *high1, ELM *low2, ELM *high2, + ELM *lowdest) +{ + ELM a1, a2; + + /* + * The following 'if' statement is not necessary + * for the correctness of the algorithm, and is + * in fact subsumed by the rest of the function. + * However, it is a few percent faster. Here is why. + * + * The merging loop below has something like + * if (a1 < a2) { + * *dest++ = a1; + * ++low1; + * if (end of array) break; + * a1 = *low1; + * } + * + * Now, a1 is needed immediately in the next iteration + * and there is no way to mask the latency of the load. + * A better approach is to load a1 *before* the end-of-array + * check; the problem is that we may be speculatively + * loading an element out of range. While this is + * probably not a problem in practice, yet I don't feel + * comfortable with an incorrect algorithm. 
Therefore, + * I use the 'fast' loop on the array (except for the last + * element) and the 'slow' loop for the rest, saving both + * performance and correctness. + */ + + if (low1 < high1 && low2 < high2) { + a1 = *low1; + a2 = *low2; + for (;;) { + if (a1 < a2) { + *lowdest++ = a1; + a1 = *++low1; + if (low1 >= high1) + break; + } else { + *lowdest++ = a2; + a2 = *++low2; + if (low2 >= high2) + break; + } + } + } + if (low1 <= high1 && low2 <= high2) { + a1 = *low1; + a2 = *low2; + for (;;) { + if (a1 < a2) { + *lowdest++ = a1; + ++low1; + if (low1 > high1) + break; + a1 = *low1; + } else { + *lowdest++ = a2; + ++low2; + if (low2 > high2) + break; + a2 = *low2; + } + } + } + if (low1 > high1) { + memcpy(lowdest, low2, sizeof(ELM) * (high2 - low2 + 1)); + } else { + memcpy(lowdest, low1, sizeof(ELM) * (high1 - low1 + 1)); + } +} + +#define swap_indices(a, b) \ +{ \ + ELM *tmp;\ + tmp = a;\ + a = b;\ + b = tmp;\ +} + +ELM *binsplit(ELM val, ELM *low, ELM *high) +{ + /* + * returns index which contains greatest element <= val. If val is + * less than all elements, returns low-1 + */ + ELM *mid; + + while (low != high) { + mid = low + ((high - low + 1) >> 1); + if (val <= *mid) + high = mid - 1; + else + low = mid; + } + + if (*low > val) + return low - 1; + else + return low; +} + + +void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest) +{ + /* + * Cilkmerge: Merges range [low1, high1] with range [low2, high2] + * into the range [lowdest, ...] + */ + + ELM *split1, *split2; /* + * where each of the ranges are broken for + * recursive merge + */ + long int lowsize; /* + * total size of lower halves of two + * ranges - 2 + */ + + /* + * We want to take the middle element (indexed by split1) from the + * larger of the two arrays. The following code assumes that split1 + * is taken from range [low1, high1]. So if [low1, high1] is + * actually the smaller range, we should swap it with [low2, high2] + */ + + if (high2 - low2 > high1 - low1) { + swap_indices(low1, low2); + swap_indices(high1, high2); + } + if (high2 < low2) { + /* smaller range is empty */ + memcpy(lowdest, low1, sizeof(ELM) * (high1 - low1)); + return; + } + if (high2 - low2 < bots_app_cutoff_value ) { + seqmerge(low1, high1, low2, high2, lowdest); + return; + } + /* + * Basic approach: Find the middle element of one range (indexed by + * split1). Find where this element would fit in the other range + * (indexed by split 2). Then merge the two lower halves and the two + * upper halves. 
+ */ + + split1 = ((high1 - low1 + 1) / 2) + low1; + split2 = binsplit(*split1, low2, high2); + lowsize = split1 - low1 + split2 - low2; + + /* + * directly put the splitting element into + * the appropriate location + */ + *(lowdest + lowsize + 1) = *split1; +#pragma omp task untied + cilkmerge_par(low1, split1 - 1, low2, split2, lowdest); +#pragma omp task untied + cilkmerge_par(split1 + 1, high1, split2 + 1, high2, + lowdest + lowsize + 2); +#pragma omp taskwait + + return; +} + +void cilksort_par(ELM *low, ELM *tmp, long size) +{ + /* + * divide the input in four parts of the same size (A, B, C, D) + * Then: + * 1) recursively sort A, B, C, and D (in parallel) + * 2) merge A and B into tmp1, and C and D into tmp2 (in parallel) + * 3) merge tmp1 and tmp2 into the original array + */ + long quarter = size / 4; + ELM *A, *B, *C, *D, *tmpA, *tmpB, *tmpC, *tmpD; + + if (size < bots_app_cutoff_value_1 ) { + /* quicksort when less than 1024 elements */ + seqquick(low, low + size - 1); + return; + } + A = low; + tmpA = tmp; + B = A + quarter; + tmpB = tmpA + quarter; + C = B + quarter; + tmpC = tmpB + quarter; + D = C + quarter; + tmpD = tmpC + quarter; + +#if defined(FORCE_TIED_TASKS) +#pragma omp task + cilksort_par(A, tmpA, quarter); +#pragma omp task + cilksort_par(B, tmpB, quarter); +#pragma omp task + cilksort_par(C, tmpC, quarter); +#pragma omp task + cilksort_par(D, tmpD, size - 3 * quarter); +#else +#pragma omp task untied + cilksort_par(A, tmpA, quarter); +#pragma omp task untied + cilksort_par(B, tmpB, quarter); +#pragma omp task untied + cilksort_par(C, tmpC, quarter); +#pragma omp task untied + cilksort_par(D, tmpD, size - 3 * quarter); +#endif +#pragma omp taskwait + +#if defined(FORCE_TIED_TASKS) +#pragma omp task + cilkmerge_par(A, A + quarter - 1, B, B + quarter - 1, tmpA); +#pragma omp task + cilkmerge_par(C, C + quarter - 1, D, low + size - 1, tmpC); +#else +#pragma omp task untied + cilkmerge_par(A, A + quarter - 1, B, B + quarter - 1, tmpA); +#pragma omp task untied + cilkmerge_par(C, C + quarter - 1, D, low + size - 1, tmpC); +#endif +#pragma omp taskwait + + cilkmerge_par(tmpA, tmpC - 1, tmpC, tmpA + size - 1, A); +} + +void scramble_array( ELM *array ) +{ + unsigned long i; + unsigned long j; + + for (i = 0; i < bots_arg_size; ++i) { + j = my_rand(); + j = j % bots_arg_size; + swap(array[i], array[j]); + } +} + +void fill_array( ELM *array ) +{ + unsigned long i; + + my_srand(1); + /* first, fill with integers 1..size */ + for (i = 0; i < bots_arg_size; ++i) { + array[i] = i; + } +} + +void sort_init ( void ) +{ + /* Checking arguments */ + if (bots_arg_size < 4) { + bots_message("%s can not be less than 4, using 4 as a parameter.\n", BOTS_APP_DESC_ARG_SIZE ); + bots_arg_size = 4; + } + + if (bots_app_cutoff_value < 2) { + bots_message("%s can not be less than 2, using 2 as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF); + bots_app_cutoff_value = 2; + } + else if (bots_app_cutoff_value > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF, bots_arg_size); + bots_app_cutoff_value = bots_arg_size; + } + + if (bots_app_cutoff_value_1 > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF_1, bots_arg_size); + bots_app_cutoff_value_1 = bots_arg_size; + } + if (bots_app_cutoff_value_2 > bots_arg_size ) { + bots_message("%s can not be greather than vector size, using %d as a parameter.\n", BOTS_APP_DESC_ARG_CUTOFF_2, 
bots_arg_size); + bots_app_cutoff_value_2 = bots_arg_size; + } + + if (bots_app_cutoff_value_2 > bots_app_cutoff_value_1) { + bots_message("%s can not be greather than %s, using %d as a parameter.\n", + BOTS_APP_DESC_ARG_CUTOFF_2, + BOTS_APP_DESC_ARG_CUTOFF_1, + bots_app_cutoff_value_1 + ); + bots_app_cutoff_value_2 = bots_app_cutoff_value_1; + } + + array = (ELM *) malloc(bots_arg_size * sizeof(ELM)); + tmp = (ELM *) malloc(bots_arg_size * sizeof(ELM)); + fill_array(array); + scramble_array(array); +} + +void sort_par ( void ) +{ + bots_message("Computing multisort algorithm (n=%d) ", bots_arg_size); + #pragma omp parallel + #pragma omp single nowait +#if defined(FORCE_TIED_TASKS) + #pragma omp task untied + cilksort_par(array, tmp, bots_arg_size); +#else + #pragma omp task untied + cilksort_par(array, tmp, bots_arg_size); +#endif + bots_message(" completed!\n"); +} + +int sort_verify ( void ) +{ + int i, success = 1; + for (i = 0; i < bots_arg_size; ++i) + if (array[i] != i) + success = 0; + + return success ? BOTS_RESULT_SUCCESSFUL : BOTS_RESULT_UNSUCCESSFUL; +} + diff --git a/src/platform/i386/runscripts/omp_sort_bots.sh b/src/platform/i386/runscripts/omp_sort_bots.sh new file mode 100644 index 0000000000..3f65db092f --- /dev/null +++ b/src/platform/i386/runscripts/omp_sort_bots.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp omp_sort_bots.o boot.o +cp test_boot.o dummy1.o +cp test_boot.o dummy2.o +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From f979d29396fb35da103581bf3ac9b7ac1b73ddf7 Mon Sep 17 00:00:00 2001 From: phani Date: Tue, 14 May 2019 10:40:27 -0400 Subject: [PATCH 066/127] part: idle thread on all cores to wakeup threads using sl_mod_part_fifo * sl_mod_part_fifo tracks a single idle thread per core which is scheduled when there is nothing else on that core to run. 
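* A rough sketch of the intended flow, using the names introduced in the
  diffs below (illustrative only, not the exact code): a worker that finds
  no task parks itself in the per-core pool, and the idle thread's only job
  is to pull parked workers back out when there may be work for them again.

	/* Illustrative sketch -- see the part.h/part.c hunks below for the real code. */
	static void
	worker_without_work(void)   /* hypothetical helper, for illustration */
	{
		/* append the calling thread to part_thdpool_core[cpu], then block */
		part_pool_block();
	}

	static void
	part_idle_fn(void *d)
	{
		/* wake at most one parked worker per iteration via sl_thd_wakeup() */
		while (1) part_pool_wakeup();
	}

* The idle thread is registered under a distinct idle priority
  (_PART_IDLE_PRIO / SL_FIFO_IDLE_PRIO), and sl_mod_schedule() only falls
  back to it when the per-core run queue is empty.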
--- .../no_interface/omp_dijkstra/Makefile | 2 +- .../no_interface/omp_fib_bots/Makefile | 2 +- .../no_interface/omp_hello/Makefile | 2 +- .../no_interface/omp_sort_bots/Makefile | 2 +- src/components/lib/part.c | 19 ++- src/components/lib/sl/Makefile | 2 +- src/components/lib/sl/sl_mod_part_fifo.c | 121 ++++++++++++++++++ src/kernel/include/shared/cos_config.h | 2 +- 8 files changed, 145 insertions(+), 7 deletions(-) create mode 100644 src/components/lib/sl/sl_mod_part_fifo.c diff --git a/src/components/implementation/no_interface/omp_dijkstra/Makefile b/src/components/implementation/no_interface/omp_dijkstra/Makefile index c018ed38c1..2724553d78 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/Makefile +++ b/src/components/implementation/no_interface/omp_dijkstra/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_dijkstra.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_fib_bots/Makefile b/src/components/implementation/no_interface/omp_fib_bots/Makefile index 20cdb21093..bdd8a43b44 100644 --- a/src/components/implementation/no_interface/omp_fib_bots/Makefile +++ b/src/components/implementation/no_interface/omp_fib_bots/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_fib_bots.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_hello/Makefile b/src/components/implementation/no_interface/omp_hello/Makefile index ac2bc81844..f15a5fd6dd 100644 --- a/src/components/implementation/no_interface/omp_hello/Makefile +++ b/src/components/implementation/no_interface/omp_hello/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_hello.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_sort_bots/Makefile b/src/components/implementation/no_interface/omp_sort_bots/Makefile index 099883c6e0..a711420191 100644 --- a/src/components/implementation/no_interface/omp_sort_bots/Makefile +++ b/src/components/implementation/no_interface/omp_sort_bots/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_sort_bots.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock 
-lcos_defkernel_api -lpart -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/lib/part.c b/src/components/lib/part.c index ce04682a7d..5847dd33b8 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -20,9 +20,19 @@ static struct part_data *part__data = NULL; struct ps_list_head part_thdpool_core[NUM_CPU]; #define PART_DEQUE_SZ 64 -#define _PART_PRIO 1 +#define _PART_PRIO TCAP_PRIO_MAX #define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) +#define _PART_IDLE_PRIO (_PART_PRIO+4) +#define _PART_IDLE_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_IDLE_PRIO) + +/* idle thread to wakeup when there is nothing to do on this core! */ +static void +part_idle_fn(void *d) +{ + while (1) part_pool_wakeup(); +} + struct part_data * part_data_alloc(void) { @@ -97,6 +107,9 @@ part_init(void) { int k; static int is_first = NUM_CPU, ds_init_done = 0; + struct sl_thd *it = NULL; + struct sl_xcore_thd *xit = NULL; + sched_param_t ip = _PART_IDLE_PRIO_PACK(); ps_list_head_init(&part_thdpool_core[cos_cpuid()]); if (!ps_cas(&is_first, NUM_CPU, cos_cpuid())) { @@ -129,5 +142,9 @@ part_init(void) assert(x); } + it = sl_thd_alloc(part_idle_fn, NULL); + assert(it); + sl_thd_param_set(it, ip); + ps_faa(&part_ready, 1); } diff --git a/src/components/lib/sl/Makefile b/src/components/lib/sl/Makefile index d17b141feb..d54ad150e6 100644 --- a/src/components/lib/sl/Makefile +++ b/src/components/lib/sl/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcore.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_mod_fifo.o sl_lock.o sl_thd_static_backend.o sl_blkpt.o +LIB_OBJS=sl_capmgr.o sl_raw.o sl_sched.o sl_xcore.o sl_child.o sl_mod_fprr.o sl_mod_rr.o sl_mod_fifo.o sl_mod_part_fifo.o sl_lock.o sl_thd_static_backend.o sl_blkpt.o LIBS=$(LIB_OBJS:%.o=%.a) CINC+=-m32 diff --git a/src/components/lib/sl/sl_mod_part_fifo.c b/src/components/lib/sl/sl_mod_part_fifo.c new file mode 100644 index 0000000000..3dee6ee9ba --- /dev/null +++ b/src/components/lib/sl/sl_mod_part_fifo.c @@ -0,0 +1,121 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include + +#define SL_FIFO_PRIO TCAP_PRIO_MAX +#define SL_FIFO_IDLE_PRIO SL_FIFO_PRIO+4 +#define SL_FIFO_PERIOD_US_MIN SL_MIN_PERIOD_US + +static struct ps_list_head threads[NUM_CPU] CACHE_ALIGNED; +static struct sl_thd_policy *idle_thd[NUM_CPU]; + +void +sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) +{ } + +struct sl_thd_policy * +sl_mod_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + return ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + if (likely(idle_thd[cos_cpuid()])) return idle_thd[cos_cpuid()]; + + return t; +} + +void +sl_mod_block(struct sl_thd_policy *t) +{ + assert(t != idle_thd[cos_cpuid()]); + ps_list_rem_d(t); +} + +void +sl_mod_wakeup(struct sl_thd_policy *t) +{ + assert(t != idle_thd[cos_cpuid()]); + assert(ps_list_singleton_d(t)); + + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_yield(struct sl_thd_policy *t, struct sl_thd_policy *yield_to) +{ + if (unlikely(t == idle_thd[cos_cpuid()])) return; + ps_list_rem_d(t); + ps_list_head_append_d(&threads[cos_cpuid()], t); +} + +void +sl_mod_thd_create(struct sl_thd_policy *t) +{ + t->priority = TCAP_PRIO_MIN; + t->period = 0; + t->period_usec = 0; + ps_list_init_d(t); + + /* TODO: add to runq here? for now, only add when PRIO is set and that's pretty much it's ARRIVAL time! */ +} + +void +sl_mod_thd_delete(struct sl_thd_policy *t) +{ + if (unlikely(t == idle_thd[cos_cpuid()])) return; + ps_list_rem_d(t); +} + +void +sl_mod_thd_param_set(struct sl_thd_policy *t, sched_param_type_t type, unsigned int v) +{ + int cpu = cos_cpuid(); + + switch (type) { + case SCHEDP_PRIO: + { + t->priority = v; + sl_thd_setprio(sl_mod_thd_get(t), t->priority); + + if (v == SL_FIFO_IDLE_PRIO) { + assert(idle_thd[cos_cpuid()] == NULL); + idle_thd[cos_cpuid()] = t; + } else { + ps_list_head_append_d(&threads[cos_cpuid()], t); + } + + break; + } + case SCHEDP_WINDOW: + { + assert(v >= SL_FIFO_PERIOD_US_MIN); + t->period_usec = v; + t->period = sl_usec2cyc(v); + + break; + } + case SCHEDP_BUDGET: + { + break; + } + default: assert(0); + } +} + +void +sl_mod_init(void) +{ + idle_thd[cos_cpuid()] = NULL; + ps_list_head_init(&threads[cos_cpuid()]); +} diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index bf501b3be9..8c46ae5377 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -17,7 +17,7 @@ #include "cpu_ghz.h" -#define NUM_CPU 1 +#define NUM_CPU 2 #define NUM_CPU_BMP_BYTES ((NUM_CPU + 7) / 8) #define NUM_CPU_BMP_WORDS ((NUM_CPU_BMP_BYTES + 3) / 4) From efcde0bbd12d61ae1ae3ff47cde986956323bcc3 Mon Sep 17 00:00:00 2001 From: phani Date: Tue, 14 May 2019 16:59:50 -0400 Subject: [PATCH 067/127] changed time output to use cos_time api in BOTS --- .../no_interface/omp_fib_bots/bots_common.c | 8 +++++--- .../no_interface/omp_fib_bots/bots_common.h | 2 +- .../no_interface/omp_hello/init.c | 18 +++++++++++++---- src/components/include/part_task.h | 8 ++++---- src/components/lib/part.c | 20 ++++++++++--------- src/kernel/include/shared/consts.h | 2 +- 6 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_common.c b/src/components/implementation/no_interface/omp_fib_bots/bots_common.c index 95d71f172a..49af8a438e 
100644 --- a/src/components/implementation/no_interface/omp_fib_bots/bots_common.c +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_common.c @@ -30,6 +30,7 @@ #include "bots_common.h" #include "bots_main.h" #include "bots.h" +#include void bots_error(int error, char *message) @@ -77,9 +78,10 @@ bots_warning(int warning, char *message) long bots_usecs (void) { - struct timeval t; - gettimeofday(&t,NULL); - return t.tv_sec*1000000+t.tv_usec; + //struct timeval t; + //gettimeofday(&t,NULL); + //return t.tv_sec*1000000+t.tv_usec; + return (long)time_now_usec(); } void diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_common.h b/src/components/implementation/no_interface/omp_fib_bots/bots_common.h index 1e306b7f1d..9d38799ef1 100644 --- a/src/components/implementation/no_interface/omp_fib_bots/bots_common.h +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_common.h @@ -51,6 +51,6 @@ void bots_get_architecture(char *str); void bots_get_load_average(char *str); void bots_print_results(void); -#define BOTS_TMP_STR_SZ 256 +#define BOTS_TMP_STR_SZ 64 #endif diff --git a/src/components/implementation/no_interface/omp_hello/init.c b/src/components/implementation/no_interface/omp_hello/init.c index 00924aac1e..f4ce213f12 100644 --- a/src/components/implementation/no_interface/omp_hello/init.c +++ b/src/components/implementation/no_interface/omp_hello/init.c @@ -31,24 +31,31 @@ cos_init(void *d) struct cos_compinfo * ci = cos_compinfo_get(defci); int i; static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static unsigned b1 = 0, b2 = 0, b3 = 0; PRINTC("In an OpenMP program!\n"); - if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init(); } else { - while (!ps_load((unsigned long *)&init_done[first])) ; + while (!ps_load(&init_done[first])) ; cos_defcompinfo_sched_init(); } - ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); + ps_faa(&init_done[cos_cpuid()], 1); /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ for (i = 0; i < NUM_CPU; i++) { - while (!ps_load((unsigned long *)&init_done[i])) ; + while (!ps_load(&init_done[i])) ; } sl_init(SL_MIN_PERIOD_US*100); + /* barrier, wait for sl_init to be done on all cores */ + ps_faa(&b1, 1); + while (ps_load(&b1) != NUM_CPU) ; cos_gomp_init(); + /* barrier, wait for gomp_init to be done on all cores */ + ps_faa(&b2, 1); + while (ps_load(&b2) != NUM_CPU) ; hypercall_comp_init_done(); if (!cos_cpuid()) { @@ -58,6 +65,9 @@ cos_init(void *d) assert(t); sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); } + /* wait for all cores to reach this point, so all threads wait for main thread to be ready! 
*/ + ps_faa(&b3, 1); + while (ps_load(&b3) != NUM_CPU) ; sl_sched_loop_nonblock(); diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index 94531a8c5e..050a82f89e 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -9,11 +9,11 @@ #define PART_THD(c, t) (cos_cpuid() << 16 | cos_thdid()) #define PART_CURR_THD PART_THD(cos_cpuid(), cos_thdid()) -#define PART_MAX_TASKS 1024 -#define PART_MAX_DATA 512 +#define PART_MAX_TASKS 256 +#define PART_MAX_DATA 128 #define PART_MAX_PAR_THDS 4 -#define PART_MAX_CORE_THDS 64 -#define PART_MAX_THDS PART_MAX_CORE_THDS*NUM_CPU +#define PART_MAX_THDS 128 +#define PART_MAX_CORE_THDS (PART_MAX_THDS/NUM_CPU) #define PART_MAX_CHILD 16 #define PART_MAX_WORKSHARES 16 diff --git a/src/components/lib/part.c b/src/components/lib/part.c index 5847dd33b8..36b2d68d7f 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -7,13 +7,13 @@ #include #include -#define PART_MAX_PAGES ((PART_MAX_TASKS * sizeof(struct part_task)) / PAGE_SIZE) -#define PART_MAX_DATA_PAGES ((PART_MAX_TASKS * sizeof(struct part_data)) / PAGE_SIZE) +#define PART_MAX_PAGES (((PART_MAX_TASKS * sizeof(struct part_task)) / PAGE_SIZE) + 1) +#define PART_MAX_DATA_PAGES (((PART_MAX_TASKS * sizeof(struct part_data)) / PAGE_SIZE) + 1) struct deque_part part_dq_percore[NUM_CPU]; //struct cirque_par parcq_global; struct ps_list_head part_l_global; -static unsigned part_ready = 0; +static volatile unsigned part_ready = 0; struct crt_lock part_l_lock; static struct part_task *part_tasks = NULL; static struct part_data *part__data = NULL; @@ -106,26 +106,25 @@ void part_init(void) { int k; - static int is_first = NUM_CPU, ds_init_done = 0; + static volatile int is_first = NUM_CPU; struct sl_thd *it = NULL; struct sl_xcore_thd *xit = NULL; sched_param_t ip = _PART_IDLE_PRIO_PACK(); + static volatile int all_done = 0; ps_list_head_init(&part_thdpool_core[cos_cpuid()]); - if (!ps_cas(&is_first, NUM_CPU, cos_cpuid())) { - while (!ps_load(&ds_init_done)) ; - } else { + if (ps_cas(&is_first, NUM_CPU, cos_cpuid())) { for (k = 0; k < NUM_CPU; k++) deque_init_part(&part_dq_percore[k], PART_DEQUE_SZ); part_tasks = (struct part_task *)memmgr_heap_page_allocn(PART_MAX_PAGES); assert(part_tasks); - + memset(part_tasks, 0, PART_MAX_PAGES * PAGE_SIZE); part__data = (struct part_data *)memmgr_heap_page_allocn(PART_MAX_DATA_PAGES); assert(part__data); + memset(part__data, 0, PART_MAX_DATA_PAGES * PAGE_SIZE); ps_list_head_init(&part_l_global); crt_lock_init(&part_l_lock); - ps_faa(&ds_init_done, 1); } for (k = 0; k < PART_MAX_CORE_THDS; k++) { @@ -146,5 +145,8 @@ part_init(void) assert(it); sl_thd_param_set(it, ip); + ps_faa(&all_done, 1); + while (ps_load(&all_done) != NUM_CPU) ; + ps_faa(&part_ready, 1); } diff --git a/src/kernel/include/shared/consts.h b/src/kernel/include/shared/consts.h index ce11fd9152..2c891fdc20 100644 --- a/src/kernel/include/shared/consts.h +++ b/src/kernel/include/shared/consts.h @@ -48,7 +48,7 @@ struct pt_regs { #endif #define MAX_SERVICE_DEPTH 31 -#define MAX_NUM_THREADS (128 * NUM_CPU) +#define MAX_NUM_THREADS (256 * NUM_CPU) /* Stacks are 2 * page_size (expressed in words) */ #define MAX_STACK_SZ_BYTE_ORDER 12 From 3482929a472d9cf07a199dd99af14b0aeb73527e Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 15 May 2019 14:41:49 -0400 Subject: [PATCH 068/127] fix a race in dequeue --- src/components/include/stacklist.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git 
a/src/components/include/stacklist.h b/src/components/include/stacklist.h index f1ddd8078d..edb6f9fd54 100644 --- a/src/components/include/stacklist.h +++ b/src/components/include/stacklist.h @@ -62,9 +62,7 @@ stacklist_add(struct stacklist_head *h, struct stacklist *l) static inline thdid_t stacklist_dequeue(cpuid_t *core, struct stacklist_head *h) { - struct stacklist *sl; - - if (!h->head) return 0; + struct stacklist *sl = NULL; /* * Only a single thread should trigger an event, and dequeue @@ -72,11 +70,10 @@ stacklist_dequeue(cpuid_t *core, struct stacklist_head *h) * this, please note that this should *not* iterate more than * once. */ - while (1) { + do { sl = ps_load(&h->head); - - if (ps_cas((unsigned long *)&h->head, (unsigned long)sl, (unsigned long)sl->next)) break; - } + if (unlikely(!sl)) return 0; + } while (!ps_cas((unsigned long *)&h->head, (unsigned long)sl, (unsigned long)sl->next)); sl->next = NULL; *core = sl->coreid; From 648eef65dac0f5bee4e75a626b3d894e93c05415 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 15 May 2019 14:43:44 -0400 Subject: [PATCH 069/127] Thread migration kernel and user-level * BIG TODO: Test it's working! --- .../implementation/capmgr/naive/cap_info.h | 11 +- .../implementation/capmgr/naive/cap_mgr.c | 26 +++ src/components/include/cos_kernel_api.h | 3 + src/components/include/sl.h | 4 + src/components/include/sl_plugins.h | 3 + src/components/include/sl_xcore.h | 29 +++- src/components/interface/capmgr/capmgr.h | 2 + .../interface/capmgr/stubs/s_stub.S | 1 + src/components/lib/cos_kernel_api.c | 12 ++ src/components/lib/sl/sl_capmgr.c | 41 +++++ src/components/lib/sl/sl_mod_fifo.c | 12 ++ src/components/lib/sl/sl_mod_fprr.c | 7 + src/components/lib/sl/sl_mod_part_fifo.c | 12 ++ src/components/lib/sl/sl_mod_rr.c | 12 ++ src/components/lib/sl/sl_raw.c | 42 +++++ src/components/lib/sl/sl_sched.c | 5 + src/components/lib/sl/sl_thd_static_backend.c | 52 +++++- src/components/lib/sl/sl_xcore.c | 151 ++++++++++++++++-- src/kernel/capinv.c | 12 ++ src/kernel/include/shared/cos_types.h | 3 +- src/kernel/include/thd.h | 50 ++++++ 21 files changed, 460 insertions(+), 30 deletions(-) diff --git a/src/components/implementation/capmgr/naive/cap_info.h b/src/components/implementation/capmgr/naive/cap_info.h index 9fdbacedf4..9919c6c796 100644 --- a/src/components/implementation/capmgr/naive/cap_info.h +++ b/src/components/implementation/capmgr/naive/cap_info.h @@ -144,11 +144,18 @@ cap_info_is_parent(struct cap_comp_info *r, spdid_t p) } static inline int -cap_info_is_sched(spdid_t c) +cap_info_is_sched_core(spdid_t c, cpuid_t core) { + if (core >= NUM_CPU) return 0; if (!c) return 1; /* llbooter! 
*/ - return bitmap_check(cap_info_schedbmp[cos_cpuid()], c - 1); + return bitmap_check(cap_info_schedbmp[core], c - 1); +} + +static inline int +cap_info_is_sched(spdid_t c) +{ + return cap_info_is_sched_core(c, cos_cpuid()); } static inline int diff --git a/src/components/implementation/capmgr/naive/cap_mgr.c b/src/components/implementation/capmgr/naive/cap_mgr.c index 81efda7172..1bd6bb61aa 100644 --- a/src/components/implementation/capmgr/naive/cap_mgr.c +++ b/src/components/implementation/capmgr/naive/cap_mgr.c @@ -358,6 +358,32 @@ capmgr_aep_create_cserialized(struct cos_dcb_info **dcb, u32_t *tcrcvret, u32_t return 0; } +int +capmgr_thd_migrate(thdid_t tid, thdcap_t tc, cpuid_t core) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct sl_thd *ti = cap_info_thd_find(rc, tid); + struct cap_comp_cpu_info *rc_cpu = NULL; + int ret; + + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + if (!cap_info_is_sched(cur) || !cap_info_is_sched_core(cur, core)) return -EINVAL; + if (!ti || !sl_thd_thdcap(ti)) return -EINVAL; + rc_cpu = cap_info_cpu_local(rc); + if (tid == rc_cpu->initthdid) return -EINVAL; + + ret = cos_thd_migrate(cap_ci, sl_thd_thdcap(ti), core); + if (ret) return ret; + ret = cos_thdcap_migrate(cap_info_ci(rc), tc); + if (ret) return ret; + ret = sl_thd_migrate(tid, core); + + return ret; +} + thdcap_t capmgr_thd_retrieve_cserialized(thdid_t *inittid, int *unused, spdid_t s, thdid_t tid) { diff --git a/src/components/include/cos_kernel_api.h b/src/components/include/cos_kernel_api.h index baa6aa28d7..fac998ae47 100644 --- a/src/components/include/cos_kernel_api.h +++ b/src/components/include/cos_kernel_api.h @@ -125,6 +125,9 @@ thdcap_t cos_thd_alloc_ext(struct cos_compinfo *ci, compcap_t comp, thdclosure_i dcboff_t dcboff); /* Create the initial (cos_init) thread */ thdcap_t cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, dcbcap_t dc); +int cos_thd_migrate(struct cos_compinfo *ci, thdcap_t thdc, cpuid_t core); +/* update the thdcap to migrated core */ +int cos_thdcap_migrate(struct cos_compinfo *ci, thdcap_t thdc); sinvcap_t cos_sinv_alloc(struct cos_compinfo *srcci, compcap_t dstcomp, vaddr_t entry, invtoken_t token); arcvcap_t cos_arcv_alloc(struct cos_compinfo *ci, thdcap_t thdcap, tcap_t tcapcap, compcap_t compcap, arcvcap_t enotif); diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 16eef7e028..ed21342c10 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -302,6 +302,10 @@ void sl_thd_yield_intern(thdid_t tid); void sl_thd_yield_cs_exit(thdid_t tid); +int sl_thd_migrate_no_cs(struct sl_thd *t, cpuid_t core); +/* @return: 0 - success, -1 - failure */ +int sl_thd_migrate(thdid_t tid, cpuid_t core); + /* The entire thread allocation and free API */ struct sl_thd *sl_thd_alloc(cos_thd_fn_t fn, void *data); struct sl_thd *sl_thd_aep_alloc(cos_aepthd_fn_t fn, void *data, int own_tcap, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax); diff --git a/src/components/include/sl_plugins.h b/src/components/include/sl_plugins.h index 0a7d22db3f..a5266f5bc9 100644 --- a/src/components/include/sl_plugins.h +++ b/src/components/include/sl_plugins.h @@ -16,6 +16,7 @@ */ struct sl_thd_policy *sl_thd_alloc_backend(thdid_t tid); void sl_thd_free_backend(struct sl_thd_policy *t); +struct sl_thd_policy *sl_thd_migrate_backend(struct sl_thd_policy *t, 
cpuid_t core); /* * cos_aep_info structs cannot be stack allocated! * The thread_alloc_backened needs to provide struct cos_aep_info without @@ -42,6 +43,8 @@ static inline struct sl_thd_policy *sl_mod_thd_policy_get(struct sl_thd *t); void sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles); struct sl_thd_policy *sl_mod_schedule(void); +/* give me the thread at the end of the run-queue */ +struct sl_thd_policy *sl_mod_last_schedule(void); void sl_mod_block(struct sl_thd_policy *t); void sl_mod_wakeup(struct sl_thd_policy *t); diff --git a/src/components/include/sl_xcore.h b/src/components/include/sl_xcore.h index 8cf838142e..b06d3c51b4 100644 --- a/src/components/include/sl_xcore.h +++ b/src/components/include/sl_xcore.h @@ -6,7 +6,9 @@ #include #include -#define SL_XCORE_PARAM_MAX 4 +#define SL_XCORE_PARAM_MAX 4 +#define SL_XCORE_MIGRATE_MAX 16 +#define SL_XCORE_KEEP_MIN 4 typedef enum { SL_XCORE_THD_ALLOC = 0, @@ -18,13 +20,31 @@ typedef enum { SL_XCORE_THD_PARAM_SET, SL_XCORE_THD_WAKEUP, + + SL_XCORE_LOAD_BALANCE, } sl_xcore_req_t; +struct sl_xcore_response { + /* request type */ + sl_xcore_req_t type; /* set by the client requesting */ + /* response fields */ + volatile int resp_ready; + union { + struct { + thdid_t tid; + } sl_xcore_resp_thd_alloc; + struct { + unsigned nthds; + thdid_t tid[SL_XCORE_MIGRATE_MAX]; + } sl_xcore_resp_load_balance; + }; +}; + struct sl_xcore_request { sl_xcore_req_t type; /* request type */ cpuid_t client_core; /* client cpu making the request */ thdid_t client_thd; - vaddr_t response; /* response addr */ + struct sl_xcore_response *response; union { struct { @@ -69,6 +89,9 @@ struct sl_xcore_request { struct { thdid_t tid; } sl_xcore_req_thd_wakeup; + struct { + int nthds; /* if 0 - migrate as many as the src can */ + } sl_xcore_req_load_balance; }; }; @@ -114,6 +137,7 @@ struct sl_global { struct sl_xcore_request xcore_rbuf[NUM_CPU][SL_XCORE_RING_SIZE]; u32_t core_bmp[(NUM_CPU + 7)/8]; /* bitmap of cores this scheduler is running on! 
*/ asndcap_t xcore_asnd[NUM_CPU][NUM_CPU]; + unsigned nthds_running[NUM_CPU] CACHE_ALIGNED; struct cos_scb_info *scb_area; } CACHE_ALIGNED; @@ -164,5 +188,6 @@ struct sl_xcore_thd *sl_xcore_initaep_alloc_ext(cpuid_t core, struct cos_defcomp void sl_xcore_thd_param_set(struct sl_xcore_thd *t, sched_param_t param); void sl_xcore_thd_wakeup(struct sl_xcore_thd *t); void sl_xcore_thd_wakeup_tid(thdid_t tid, cpuid_t core); +int sl_xcore_load_balance(void); #endif /* SL_XCORE_H */ diff --git a/src/components/interface/capmgr/capmgr.h b/src/components/interface/capmgr/capmgr.h index bff7c3f32e..7e1c873414 100644 --- a/src/components/interface/capmgr/capmgr.h +++ b/src/components/interface/capmgr/capmgr.h @@ -24,4 +24,6 @@ asndcap_t capmgr_asnd_create(spdid_t child, thdid_t t); asndcap_t capmgr_asnd_rcv_create(arcvcap_t rcv); asndcap_t capmgr_asnd_key_create(cos_channelkey_t key); +int capmgr_thd_migrate(thdid_t tid, thdcap_t tc, cpuid_t core); + #endif /* CAPMGR_H */ diff --git a/src/components/interface/capmgr/stubs/s_stub.S b/src/components/interface/capmgr/stubs/s_stub.S index bb7c472163..ef2d82a56a 100644 --- a/src/components/interface/capmgr/stubs/s_stub.S +++ b/src/components/interface/capmgr/stubs/s_stub.S @@ -21,6 +21,7 @@ cos_asm_server_stub(capmgr_rcv_create_cserialized) cos_asm_server_stub(capmgr_asnd_create) cos_asm_server_stub(capmgr_asnd_rcv_create) cos_asm_server_stub(capmgr_asnd_key_create) +cos_asm_server_stub(capmgr_thd_migrate) cos_asm_server_stub(memmgr_heap_page_allocn) cos_asm_server_stub_rets(memmgr_shared_page_allocn_cserialized) diff --git a/src/components/lib/cos_kernel_api.c b/src/components/lib/cos_kernel_api.c index 19b07f3e8a..cbd7f01dfd 100644 --- a/src/components/lib/cos_kernel_api.c +++ b/src/components/lib/cos_kernel_api.c @@ -629,6 +629,18 @@ cos_initthd_alloc(struct cos_compinfo *ci, compcap_t comp, dcbcap_t dc) return __cos_thd_alloc(ci, comp, 0, dc, 0); } +int +cos_thd_migrate(struct cos_compinfo *ci, thdcap_t t, cpuid_t c) +{ + return call_cap_op(ci->captbl_cap, CAPTBL_OP_THDMIGRATE, t, c, 0, 0); +} + +int +cos_thdcap_migrate(struct cos_compinfo *ci, thdcap_t t) +{ + return call_cap_op(ci->captbl_cap, CAPTBL_OP_THDMIGRATE, t, 0, 1, 0); +} + dcbcap_t cos_dcb_alloc(struct cos_compinfo *ci, pgtblcap_t ptcap, vaddr_t uaddr) { diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index 88c595b34a..e0e5be9b01 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -109,6 +109,7 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -136,6 +137,7 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn t = sl_thd_alloc_init(aep, snd, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -162,6 +164,7 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vad t = sl_thd_alloc_init(aep, 0, 0, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } else { struct cos_aep_info *compaep = cos_sched_aep_get(comp); @@ -209,6 +212,7 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t t = sl_thd_alloc_init(aep, 0, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } done: @@ 
-233,6 +237,7 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -418,3 +423,39 @@ sl_thd_free(struct sl_thd *t) sl_thd_free_no_cs(t); sl_cs_exit(); } + +int +sl_thd_migrate_no_cs(struct sl_thd *t, cpuid_t core) +{ + struct sl_thd_policy *x = NULL; + int ret; + + if (t->properties) return -1; + if (t->state != SL_THD_RUNNABLE) return -1; + /* capmgr should migrate the thdcap as well */ + ret = capmgr_thd_migrate(sl_thd_thdid(t), sl_thd_thdcap(t), core); + if (ret) return -1; + sl_mod_thd_delete(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); + + x = sl_thd_migrate_backend(sl_mod_thd_policy_get(t), core); + if (!x) return -1; + + return 0; +} + +int +sl_thd_migrate(thdid_t tid, cpuid_t core) +{ + int ret; + struct sl_thd *c = sl_thd_curr(), *t = sl_thd_lkup(tid); + + if (core == cos_cpuid()) return -1; + if (sl_thd_rcvcap(t) || sl_thd_tcap(t)) return -1; + assert(c != t); + sl_cs_enter(); + ret = sl_thd_migrate_no_cs(t, core); + sl_cs_exit(); + + return ret; +} diff --git a/src/components/lib/sl/sl_mod_fifo.c b/src/components/lib/sl/sl_mod_fifo.c index b4c7d5cab1..3824356794 100644 --- a/src/components/lib/sl/sl_mod_fifo.c +++ b/src/components/lib/sl/sl_mod_fifo.c @@ -30,6 +30,18 @@ sl_mod_schedule(void) return t; } +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_last_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + void sl_mod_block(struct sl_thd_policy *t) { diff --git a/src/components/lib/sl/sl_mod_fprr.c b/src/components/lib/sl/sl_mod_fprr.c index 31d74b0566..8992ea0a57 100644 --- a/src/components/lib/sl/sl_mod_fprr.c +++ b/src/components/lib/sl/sl_mod_fprr.c @@ -35,6 +35,13 @@ sl_mod_schedule(void) return t; } +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + /* not supported! 
*/ + return NULL; +} + static inline void __sl_mod_bmp_unset(struct sl_thd_policy *t) { diff --git a/src/components/lib/sl/sl_mod_part_fifo.c b/src/components/lib/sl/sl_mod_part_fifo.c index 3dee6ee9ba..8ea5908773 100644 --- a/src/components/lib/sl/sl_mod_part_fifo.c +++ b/src/components/lib/sl/sl_mod_part_fifo.c @@ -35,6 +35,18 @@ sl_mod_schedule(void) return t; } +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + struct sl_thd_policy *t = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_last_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + void sl_mod_block(struct sl_thd_policy *t) { diff --git a/src/components/lib/sl/sl_mod_rr.c b/src/components/lib/sl/sl_mod_rr.c index d02bf502dd..ef3116a97c 100644 --- a/src/components/lib/sl/sl_mod_rr.c +++ b/src/components/lib/sl/sl_mod_rr.c @@ -25,6 +25,18 @@ sl_mod_schedule(void) return t; } +struct sl_thd_policy * +sl_mod_last_schedule(void) +{ + struct sl_thd_policy *t = NULL, *tl = NULL; + + if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; + t = ps_list_head_last_d(&threads[cos_cpuid()], struct sl_thd_policy); + +done: + return t; +} + void sl_mod_block(struct sl_thd_policy *t) { diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index 92f1a0b645..618eac31b5 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -103,6 +103,7 @@ sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data) t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -129,6 +130,7 @@ sl_thd_comp_init_no_cs(struct cos_defcompinfo *comp, sl_thd_property_t prps, asn t = sl_thd_alloc_init(aep, snd, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -155,6 +157,7 @@ sl_thd_alloc_ext_dcb_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, t = sl_thd_alloc_init(aep, 0, 0, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } else { assert(idx == 0); ret = cos_initaep_alloc(comp, NULL, 0, dcbcap); @@ -191,6 +194,7 @@ sl_thd_aep_alloc_no_cs(cos_aepthd_fn_t fn, void *data, sl_thd_property_t prps, c t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); done: return t; @@ -229,6 +233,7 @@ sl_thd_aep_alloc_ext_dcb_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sche t = sl_thd_alloc_init(aep, 0, prps, NULL); sl_mod_thd_create(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); if (extrcv) *extrcv = sl_thd_rcvcap(t); } @@ -365,3 +370,40 @@ sl_thd_free(struct sl_thd *t) sl_thd_free_no_cs(t); sl_cs_exit(); } + +int +sl_thd_migrate_no_cs(struct sl_thd *t, cpuid_t core) +{ + struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *ci = cos_compinfo_get(dci); + struct sl_thd_policy *x = NULL; + int ret; + + if (t->properties) return -1; + if (t->state != SL_THD_RUNNABLE) return -1; + ret = cos_thd_migrate(ci, sl_thd_thdcap(t), core); + if (ret) return -1; + sl_mod_thd_delete(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); + + x = sl_thd_migrate_backend(sl_mod_thd_policy_get(t), core); + if (!x) return -1; + + return 0; +} + +int +sl_thd_migrate(thdid_t tid, cpuid_t core) +{ + int ret; + struct sl_thd *c = 
sl_thd_curr(), *t = sl_thd_lkup(tid); + + if (core == cos_cpuid()) return -1; + assert(c != t); + sl_cs_enter(); + ret = sl_thd_migrate_no_cs(t, core); + sl_cs_exit(); + + return ret; +} + diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 062a6ccb1a..c05fb58f66 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -159,6 +159,7 @@ sl_thd_free_no_cs(struct sl_thd *t) if (t->state == SL_THD_BLOCKED_TIMEOUT) sl_timeout_remove(t); sl_thd_index_rem_backend(sl_mod_thd_policy_get(t)); sl_mod_thd_delete(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); t->state = SL_THD_FREE; /* TODO: add logic for the graveyard to delay this deallocation if t == current */ sl_thd_free_backend(sl_mod_thd_policy_get(t)); @@ -198,6 +199,7 @@ sl_thd_sched_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t t assert(sl_thd_is_runnable(t)); sl_mod_block(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); update: t->state = block_type; @@ -253,6 +255,7 @@ sl_thd_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t timeout sl_thd_sched_unblock_no_cs(t); assert(t->state == SL_THD_RUNNABLE); sl_mod_block(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); t->state = block_type; if (block_type == SL_THD_BLOCKED_TIMEOUT) sl_timeout_block(t, timeout); @@ -392,6 +395,7 @@ sl_thd_sched_wakeup_no_cs(struct sl_thd *t) if (t->state == SL_THD_BLOCKED_TIMEOUT) sl_timeout_remove(t); t->state = SL_THD_RUNNABLE; sl_mod_wakeup(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); return 0; } @@ -409,6 +413,7 @@ sl_thd_wakeup_no_cs_rm(struct sl_thd *t) assert(t->state == SL_THD_BLOCKED || t->state == SL_THD_BLOCKED_TIMEOUT); t->state = SL_THD_RUNNABLE; sl_mod_wakeup(sl_mod_thd_policy_get(t)); + ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); t->rcv_suspended = 0; return 0; diff --git a/src/components/lib/sl/sl_thd_static_backend.c b/src/components/lib/sl/sl_thd_static_backend.c index 86aa4eac66..3cce2a005a 100644 --- a/src/components/lib/sl/sl_thd_static_backend.c +++ b/src/components/lib/sl/sl_thd_static_backend.c @@ -17,26 +17,62 @@ static struct cos_aep_info __sl_aep_infos[NUM_CPU][SL_MAX_NUM_THDS]; static u32_t __sl_aep_free_off[NUM_CPU]; /* Default implementations of backend functions */ -struct sl_thd_policy * -sl_thd_alloc_backend(thdid_t tid) +static inline struct sl_thd_policy * +sl_thd_alloc_backend_core(cpuid_t core, thdid_t tid) { assert(tid < SL_MAX_NUM_THDS); - return &(__sl_threads[cos_cpuid()][tid]); + return &(__sl_threads[core][tid]); } -struct cos_aep_info * -sl_thd_alloc_aep_backend(void) +static inline struct cos_aep_info * +sl_thd_alloc_aep_backend_core(cpuid_t core) { + int off = 0; struct cos_aep_info *aep = NULL; - assert(__sl_aep_free_off[cos_cpuid()] < SL_MAX_NUM_THDS); - aep = &(__sl_aep_infos[cos_cpuid()][__sl_aep_free_off[cos_cpuid()]]); - ps_faa((unsigned long *)&(__sl_aep_free_off[cos_cpuid()]), 1); + off = ps_faa((unsigned long *)&__sl_aep_free_off[core], 1); + assert(off < SL_MAX_NUM_THDS); + aep = &__sl_aep_infos[core][off]; return aep; } +struct sl_thd_policy * +sl_thd_migrate_backend(struct sl_thd_policy *t, cpuid_t core) +{ + assert(core != cos_cpuid() && core < NUM_CPU); + + struct cos_aep_info *a = sl_thd_alloc_aep_backend_core(core); + struct cos_aep_info *b = sl_thd_aepinfo(sl_mod_thd_get(t)); + struct sl_thd_policy *tc = sl_thd_alloc_backend_core(core, 
b->tid); + struct sl_thd *x = sl_mod_thd_get(tc), *y = sl_mod_thd_get(t); + + memset(a, 0, sizeof(struct cos_aep_info)); + a->tid = b->tid; + a->thd = b->thd; + assert(b->rcv == 0 && b->tc == 0); + memset(b, 0, sizeof(struct cos_aep_info)); + + memcpy(tc, t, sizeof(struct sl_thd_policy)); + x->aepinfo = a; + memset(t, 0, sizeof(struct sl_thd_policy)); + + return tc; +} + +struct sl_thd_policy * +sl_thd_alloc_backend(thdid_t tid) +{ + return sl_thd_alloc_backend_core(cos_cpuid(), tid); +} + +struct cos_aep_info * +sl_thd_alloc_aep_backend(void) +{ + return sl_thd_alloc_aep_backend_core(cos_cpuid()); +} + void sl_thd_free_backend(struct sl_thd_policy *t) { } diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c index ca17543a34..665c4be9af 100644 --- a/src/components/lib/sl/sl_xcore.c +++ b/src/components/lib/sl/sl_xcore.c @@ -10,6 +10,17 @@ static struct sl_xcore_thd _xcore_thds[MAX_NUM_THREADS]; extern void sl_thd_param_set_no_cs(struct sl_thd *, sched_param_t); +static inline void +_sl_xcore_response_wait(struct sl_xcore_response *r) +{ + if (sl_thd_curr() != sl__globals_core()->sched_thd) { + if (!ps_load(&r->resp_ready)) sl_thd_block(0); + } else { + while (!ps_load(&r->resp_ready)) ; + } + assert(r->resp_ready); +} + static inline struct sl_xcore_thd * _sl_xcore_thd_backend_lookup(thdid_t tid) { @@ -35,8 +46,10 @@ sl_xcore_thd_lookup_init(thdid_t tid, cpuid_t core) /* TODO: is this safe? a wrong coreid can cause DOS! */ if (unlikely(!(t->thd))) return _sl_xcore_thd_backend_init(tid, core, 0); - /* something wrong! */ - if (unlikely(t->core != core)) return NULL; + + /* perhaps migrated! */ + if (unlikely(t->core != core)) t->core = core; + /* if (unlikely(t->core != core)) return NULL; */ return t; } @@ -47,12 +60,17 @@ sl_xcore_thd_lookup(thdid_t tid) return _sl_xcore_thd_backend_lookup(tid); } -#define SL_XCORE_REQ(req, typ, resp) do { \ - req.type = typ; \ - req.client_core = cos_cpuid(); \ - req.client_thd = cos_thdid(); \ - req.response = resp; \ - } while (0) +#define SL_XCORE_REQ(req, typ, resp) do { \ + req.type = typ; \ + req.client_core = cos_cpuid(); \ + req.client_thd = cos_thdid(); \ + req.response = resp; \ + } while (0) + +#define SL_XCORE_RESP(resp, typ) do { \ + resp.type = typ; \ + resp.resp_ready = 0; \ + } while (0) extern struct sl_thd *sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data); @@ -97,9 +115,11 @@ sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched int ret = 0; asndcap_t snd = 0; struct sl_xcore_request req; - volatile thdid_t xcore_tid = 0; + struct sl_xcore_response resp; + thdid_t xcore_tid; - SL_XCORE_REQ(req, SL_XCORE_THD_ALLOC, (vaddr_t)&xcore_tid); + SL_XCORE_REQ(req, SL_XCORE_THD_ALLOC, &resp); + SL_XCORE_RESP(resp, SL_XCORE_THD_ALLOC); req.sl_xcore_req_thd_alloc.fn = fn; req.sl_xcore_req_thd_alloc.data = data; if (nparams) memcpy(req.sl_xcore_req_thd_alloc.params, params, sizeof(sched_param_t) * nparams); @@ -109,11 +129,8 @@ sl_xcore_thd_alloc(cpuid_t core, cos_thd_fn_t fn, void *data, int nparams, sched if (unlikely(ret)) return NULL; /* Other core will wake this up after creation! 
*/ - if (sl_thd_curr() != sl__globals_core()->sched_thd) { - sl_thd_block(0); - } else { - while (!ps_load(&xcore_tid)) ; - } + _sl_xcore_response_wait(&resp); + xcore_tid = resp.sl_xcore_resp_thd_alloc.tid; assert(xcore_tid); return _sl_xcore_thd_backend_init(xcore_tid, core, 0); @@ -195,7 +212,65 @@ sl_xcore_thd_wakeup_tid(thdid_t tid, cpuid_t core) sl_xcore_thd_wakeup(t); } +int +sl_xcore_load_balance(void) +{ + struct sl_xcore_request req; + struct sl_xcore_response resp; + struct sl_global *g = sl__globals(); + unsigned max = 0, i, nthds = 0; + int core = -1, ret; + + for (i = 0; i < NUM_CPU; i++) { + if (!bitmap_check(g->core_bmp, i)) continue; + + if (g->nthds_running[i] <= max) continue; + + max = g->nthds_running[i]; + core = i; + break; + } + + if (max == 0 || core == -1) return -1; + + memset(&req, 0, sizeof(req)); + SL_XCORE_REQ(req, SL_XCORE_LOAD_BALANCE, &resp); + SL_XCORE_RESP(resp, SL_XCORE_LOAD_BALANCE); + req.sl_xcore_req_load_balance.nthds = 1; /* FIXME: lets start with just 1 */ + ret = _sl_xcore_request_enqueue((cpuid_t)core, &req); + if (unlikely(ret)) return -1; + + _sl_xcore_response_wait(&resp); + nthds = resp.sl_xcore_resp_load_balance.nthds; + if (!nthds) return 0; + + assert(nthds < SL_XCORE_MIGRATE_MAX); + sl_cs_enter(); + for (i = 0; i < nthds; i++) { + struct sl_thd *t = sl_thd_lkup(resp.sl_xcore_resp_load_balance.tid[i]); + + assert(t); + assert(t->state == SL_THD_RUNNABLE); + sl_mod_wakeup(sl_mod_thd_policy_get(t)); + ps_faa(&(g->nthds_running[cos_cpuid()]), 1); + } + sl_cs_exit(); + + return nthds; +} + /******************************* Server-side ***************************/ +static inline void +_sl_xcore_respond(struct sl_xcore_request *req) +{ + struct sl_xcore_response *resp = req->response; + + if (!resp) return; + + assert(resp->type == req->type && ps_load(&resp->resp_ready) == 0); + ps_faa(&resp->resp_ready, 1); + _sl_xcore_thd_wakeup_tid_no_cs(req->client_thd, req->client_core); +} static inline int _sl_xcore_req_thd_alloc_no_cs(struct sl_xcore_request *req) @@ -203,15 +278,15 @@ _sl_xcore_req_thd_alloc_no_cs(struct sl_xcore_request *req) cos_thd_fn_t fn = req->sl_xcore_req_thd_alloc.fn; void *data = req->sl_xcore_req_thd_alloc.data; struct sl_thd *t; + struct sl_xcore_response *x = req->response; int i; assert(fn); t = sl_thd_alloc_no_cs(fn, data); assert(t); - if (likely(req->response)) *((thdid_t *)req->response) = sl_thd_thdid(t); + if (likely(x)) x->sl_xcore_resp_thd_alloc.tid = sl_thd_thdid(t); for (i = 0; i < req->sl_xcore_req_thd_alloc.param_count; i++) sl_thd_param_set_no_cs(t, req->sl_xcore_req_thd_alloc.params[i]); - _sl_xcore_thd_wakeup_tid_no_cs(req->client_thd, req->client_core); return 0; } @@ -239,12 +314,48 @@ _sl_xcore_req_thd_wakeup_no_cs(struct sl_xcore_request *req) return 0; } +static inline void +_sl_xcore_req_load_balance_no_cs(struct sl_xcore_request *req) +{ + struct sl_global *g = sl__globals(); + int n = g->nthds_running[cos_cpuid()], i, j = 0; + struct sl_xcore_response *rp = req->response; + cpuid_t cl_core = req->client_core; + + if (n <= SL_XCORE_KEEP_MIN) return; + n -= SL_XCORE_KEEP_MIN; + + if (n > SL_XCORE_MIGRATE_MAX) n = SL_XCORE_MIGRATE_MAX; + if (n > req->sl_xcore_req_load_balance.nthds) n = req->sl_xcore_req_load_balance.nthds; + + assert(rp); + for (i = 0; i < n; i++) { + struct sl_thd_policy *t = sl_mod_last_schedule(); + thdid_t tid = 0; + struct sl_xcore_thd *xt = NULL; + + if (!t) break; + tid = sl_thd_thdid(sl_mod_thd_get(t)); + xt = sl_xcore_thd_lookup(tid); + assert(xt); + if (xt->thd == tid) 
assert(xt->core == cos_cpuid()); + if (sl_thd_migrate_no_cs(sl_mod_thd_get(t), cl_core)) break; + sl_xcore_thd_lookup_init(tid, cl_core); + rp->sl_xcore_resp_load_balance.tid[i] = tid; + } + rp->sl_xcore_resp_load_balance.nthds = i; + + return; +} + int sl_xcore_process_no_cs(void) { int num = 0; struct sl_xcore_request xcore_req; + if (likely(NUM_CPU < 2)) return 0; + while (ck_ring_dequeue_mpsc_xcore(sl__ring_curr(), sl__ring_buffer_curr(), &xcore_req) == true) { assert(xcore_req.client_core != cos_cpuid()); @@ -275,12 +386,18 @@ sl_xcore_process_no_cs(void) _sl_xcore_req_thd_wakeup_no_cs(&xcore_req); break; } + case SL_XCORE_LOAD_BALANCE: + { + _sl_xcore_req_load_balance_no_cs(&xcore_req); + break; + } default: { PRINTC("Unrecognized request! Aborting!\n"); assert(0); } } + _sl_xcore_respond(&xcore_req); num ++; } diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index c9fd687617..cb482de624 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -1316,6 +1316,18 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * break; } + case CAPTBL_OP_THDMIGRATE: { + u32_t reg2 = __userregs_get2(regs); + u32_t reg3 = __userregs_get3(regs); + + if (reg3) { + ret = thd_migrate_cap(ct, capin); + } else { + ret = thd_migrate(ct, capin, reg2); + } + + break; + } case CAPTBL_OP_TCAP_ACTIVATE: { capid_t tcap_cap = __userregs_get1(regs) >> 16; capid_t pgtbl_cap = (__userregs_get1(regs) << 16) >> 16; diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index d100c343cf..f96d1a9be2 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -84,6 +84,7 @@ typedef enum { CAPTBL_OP_THDACTIVATE, CAPTBL_OP_THDDEACTIVATE, CAPTBL_OP_THDTLSSET, + CAPTBL_OP_THDMIGRATE, CAPTBL_OP_COMPACTIVATE, CAPTBL_OP_COMPDEACTIVATE, CAPTBL_OP_SINVACTIVATE, @@ -208,7 +209,6 @@ __captbl_cap2sz(cap_t c) /* TODO: optimize for invocation and return */ switch (c) { case CAP_SRET: - case CAP_THD: case CAP_TCAP: return CAP_SZ_16B; case CAP_SCB: @@ -217,6 +217,7 @@ __captbl_cap2sz(cap_t c) case CAP_PGTBL: case CAP_HW: /* TODO: 256bits = 32B * 8b */ return CAP_SZ_32B; + case CAP_THD: /* to allow thread migration across cores using the same capability */ case CAP_SINV: case CAP_COMP: case CAP_ASND: diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 224c7e57fd..1705fcd78f 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -362,6 +362,56 @@ thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, c return ret; } +static inline int +thd_migrate_cap(struct captbl *ct, capid_t thd_cap) +{ + struct thread *thd; + struct cap_thd *tc; + + /* we migrated the capability to core */ + tc = (struct cap_thd *)captbl_lkup(ct, thd_cap); + if (!tc || tc->h.type != CAP_THD || get_cpuid() != tc->cpuid) return -EINVAL; + thd = tc->t; + tc->cpuid = thd->cpuid; + + return 0; +} + +static inline int +thd_migrate(struct captbl *ct, capid_t thd_cap, cpuid_t core) +{ + struct thread *thd; + struct cap_thd *tc; + + tc = (struct cap_thd *)captbl_lkup(ct, thd_cap); + if (!tc || tc->h.type != CAP_THD || get_cpuid() != tc->cpuid) return -EINVAL; + thd = tc->t; + if (NUM_CPU < 2 || core >= NUM_CPU || core < 0) return -EINVAL; + if (tc->cpuid != thd->cpuid) return -EINVAL; /* outdated capability */ + if (thd->cpuid == core) return -EINVAL; /* already migrated. 
invalid req */ + if (thd->cpuid != get_cpuid()) return -EPERM; /* only push migration */ + + if (thd_current(cos_cpu_local_info()) == thd) return -EPERM; /* not a running thread! */ + if (thd->invstk_top > 0) return -EPERM; /* not if its in an invocation */ + if (thd_bound2rcvcap(thd) || thd->rcvcap.rcvcap_thd_notif) return -EPERM; /* not if it's an AEP */ + if (thd->rcvcap.rcvcap_tcap) return -EPERM; /* not if it has its own tcap on this core */ + + thd->scheduler_thread = NULL; + thd->cpuid = core; + /* we also migrated the capability to core */ + tc->cpuid = core; + + /* + * TODO: + * given that the thread is not running right now, + * and we don't allow migrating a thread that's in an invocation for now, + * i think we can find the COREID_OFFSET/CPUID_OFFSET on stack and fix the + * core id right here?? + */ + + return 0; +} + static int thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capin, livenessid_t lid, capid_t pgtbl_cap, capid_t cosframe_addr, capid_t dcbcap, const int root) From d19c3a3408b5ee26530f45d0f524e6ad06e9f62a Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 15 May 2019 14:44:40 -0400 Subject: [PATCH 070/127] use global main_task if no nesting for outer parallel construct * This is to avoid locks in common case. * inner parallel structs will use stack for part_task but they're not going to be added into the main list as they're going to be run by the master thread --- src/components/include/part.h | 49 ++++++++++++++++++++++++-- src/components/lib/cos_gomp/cos_gomp.c | 48 ++++++++++++++++++------- src/components/lib/part.c | 12 ++++++- 3 files changed, 93 insertions(+), 16 deletions(-) diff --git a/src/components/include/part.h b/src/components/include/part.h index ed5af5ae87..09e761c595 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -7,8 +7,9 @@ #include #include +#include -#define PART_NESTED 0 /* 0 - disabled, 1 - enabled */ +#undef PART_ENABLE_NESTED //#include DEQUE_PROTOTYPE(part, struct part_task *); @@ -17,9 +18,14 @@ DEQUE_PROTOTYPE(part, struct part_task *); extern struct deque_part part_dq_percore[]; //extern struct cirque_par parcq_global; /* FIXME: use stacklist or another stack like data structure? */ +extern struct ps_list_head part_thdpool_core[]; +extern volatile int in_main_parallel; +#if defined(PART_ENABLE_NESTED) extern struct ps_list_head part_l_global; extern struct crt_lock part_l_lock; -extern struct ps_list_head part_thdpool_core[]; +#else +extern struct part_task main_task; +#endif static inline struct deque_part * part_deque_curr(void) @@ -47,11 +53,13 @@ part_thdpool_curr(void) // return &parcq_global; //} +#if defined(PART_ENABLE_NESTED) static inline struct ps_list_head * part_list(void) { return &part_l_global; } +#endif static inline int part_deque_push(struct part_task *t) @@ -116,6 +124,9 @@ part_pool_wakeup(void) sl_cs_enter(); if (unlikely(ps_list_head_empty(part_thdpool_curr()))) { sl_cs_exit(); + + /* there is nothing in the pool, should we do load-balance? 
*/ + //sl_xcore_load_balance(); return; } @@ -164,28 +175,49 @@ part_pool_block(void) static inline void part_list_append(struct part_task *t) { +#if defined(PART_ENABLE_NESTED) assert(ps_list_singleton(t, partask)); assert(t->type == PART_TASK_T_WORKSHARE); + if (t->nthds == 1) return; crt_lock_take(&part_l_lock); ps_list_head_append(part_list(), t, partask); crt_lock_release(&part_l_lock); + part_pool_wakeup(); +#endif + + if (t != &main_task) { + assert(ps_load(&in_main_parallel)); + return; + } + assert(ps_load(&in_main_parallel) == 0); + + ps_faa(&in_main_parallel, 1); } static inline void part_list_remove(struct part_task *t) { +#if defined(PART_ENABLE_NESTED) assert(t->type == PART_TASK_T_WORKSHARE); assert(!ps_list_singleton(t, partask)); crt_lock_take(&part_l_lock); ps_list_rem(t, partask); crt_lock_release(&part_l_lock); +#endif + assert(ps_load(&in_main_parallel)); + if (t != &main_task) return; + + ps_faa(&in_main_parallel, -1); } static inline struct part_task * part_list_peek(void) { + if (!ps_load(&in_main_parallel)) return NULL; + +#if defined(PART_ENABLE_NESTED) struct part_task *t = NULL; int found = 0; @@ -214,6 +246,17 @@ part_list_peek(void) if (unlikely(!found)) return NULL; return t; +#else + int i; + + assert(main_task.type == PART_TASK_T_WORKSHARE); + i = part_task_work_try(&main_task); + assert(i != 0); + + if (likely(i > 0 && !ps_load(&main_task.end))) return &main_task; + + return NULL; +#endif } void part_init(void); @@ -266,6 +309,8 @@ part_thd_fn(void *d) /* parallel runtime not ready? */ if (unlikely(!part_isready())) part_pool_block(); + /* not in the main parallel block? */ + while (!ps_load(&in_main_parallel)) part_pool_block(); while (1) { struct part_task *t = NULL; diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index 4c98bb01fe..eb1b32da71 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -51,10 +51,14 @@ _gomp_parallel_start(struct part_task *pt, void (*fn) (void *), void *data, unsi struct sl_thd *t = sl_thd_curr(); struct part_task *parent = (struct part_task *)t->part_context; + if (parent) assert(ps_load(&in_main_parallel)); + num_threads = (num_threads == 0 || num_threads > COS_GOMP_MAX_THDS) ? COS_GOMP_MAX_THDS : num_threads; /* nesting? 
*/ - if (unlikely(parent && PART_NESTED == 0)) num_threads = 1; +#if !defined(PART_ENABLE_NESTED) + if (unlikely(parent)) num_threads = 1; +#endif part_task_init(pt, PART_TASK_T_WORKSHARE, parent, num_threads, fn, data, NULL); assert(pt->nthds == num_threads); @@ -63,14 +67,8 @@ _gomp_parallel_start(struct part_task *pt, void (*fn) (void *), void *data, unsi assert(parent_off >= 0); } t->part_context = pt; - - if (unlikely(num_threads > 1)) { - unsigned i; - - part_list_append(pt); - - for (i = 1; i < num_threads; i++) part_pool_wakeup(); - } + /* should not append to workshare list if it's a task with nthds == 1 */ + part_list_append(pt); } static inline void @@ -85,11 +83,23 @@ void GOMP_parallel (void (*fn) (void *), void *data, unsigned num_threads, unsigned int flags) { + struct part_task *prt = NULL; struct part_task pt; - _gomp_parallel_start(&pt, fn, data, num_threads, flags); +#if defined(PART_ENABLE_NESTED) + prt = &pt +#else + struct sl_thd *t = sl_thd_curr(); + struct part_task *parent = (struct part_task *)t->part_context; + + /* child parallel will not be nested, will be run by this thread and also not added to the global list */ + if(parent) prt = &pt; + else prt = &main_task; +#endif + + _gomp_parallel_start(prt, fn, data, num_threads, flags); fn(data); - _gomp_parallel_end(&pt); + _gomp_parallel_end(prt); } bool @@ -221,15 +231,27 @@ GOMP_parallel_loop_dynamic (void (*fn) (void *), void *data, unsigned num_threads, long start, long end, long incr, long chunk_size, unsigned flags) { + struct part_task *prt = NULL; struct part_task pt; bool ret; - _gomp_parallel_start(&pt, fn, data, num_threads, flags); +#if defined(PART_ENABLE_NESTED) + prt = &pt +#else + struct sl_thd *t = sl_thd_curr(); + struct part_task *parent = (struct part_task *)t->part_context; + + /* child parallel will not be nested, will be run by this thread and also not added to the global list */ + if (parent) prt = &pt; + else prt = &main_task; +#endif + + _gomp_parallel_start(prt, fn, data, num_threads, flags); ret = GOMP_loop_dynamic_start(start, end, incr, chunk_size, NULL, NULL); assert(ret == true); fn(data); - _gomp_parallel_end(&pt); + _gomp_parallel_end(prt); } bool diff --git a/src/components/lib/part.c b/src/components/lib/part.c index 36b2d68d7f..684e58ee2c 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -12,9 +12,14 @@ struct deque_part part_dq_percore[NUM_CPU]; //struct cirque_par parcq_global; -struct ps_list_head part_l_global; static volatile unsigned part_ready = 0; +volatile int in_main_parallel; +#if defined(PART_ENABLE_NESTED) struct crt_lock part_l_lock; +struct ps_list_head part_l_global; +#else +struct part_task main_task; +#endif static struct part_task *part_tasks = NULL; static struct part_data *part__data = NULL; struct ps_list_head part_thdpool_core[NUM_CPU]; @@ -123,8 +128,13 @@ part_init(void) assert(part__data); memset(part__data, 0, PART_MAX_DATA_PAGES * PAGE_SIZE); +#if defined(PART_ENABLE_NESTED) ps_list_head_init(&part_l_global); crt_lock_init(&part_l_lock); +#else + memset(&main_task, 0, sizeof(main_task)); +#endif + in_main_parallel = 0; } for (k = 0; k < PART_MAX_CORE_THDS; k++) { From 14128a530ba13f9d06c1d33e8b963ac04c013809 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 15 May 2019 15:43:08 -0400 Subject: [PATCH 071/127] For migration, previously changed thd cap's size to take one full cache line. * But I didn't fix INIT caps and I didn't test any of it in the last commit. 
* per-core caps have to be on seperate cache lines, now, with thread migration and reuse of a capability, changed thread to use a full cache line. * initthd, inittcap both occupy 64B cacheline. * For the other caps, cos_kernel_api does size based and core-based frontiers, so we're good. * Trade off: If a thread is migrated across multiple cores, and if we freshly create per-core capability slot on each migration, we'd probably be wasting a lot of space especially if we don't use that core-local capability. In this simple(??) design though, we use a single capability that is as big as a cache line and modify coreid on it to make it a core-local for migrated core. --- src/kernel/include/shared/cos_types.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index f96d1a9be2..5eb0cf7e3c 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -277,19 +277,19 @@ enum */ BOOT_CAPTBL_SELF_INITRCV_BASE = round_up_to_pow2(BOOT_CAPTBL_SELF_INITTHD_BASE + NUM_CPU * CAP64B_IDSZ, CAPMAX_ENTRY_SZ), - BOOT_CAPTBL_LAST_CAP = BOOT_CAPTBL_SELF_INITRCV_BASE + NUM_CPU * CAP64B_IDSZ, + BOOT_CAPTBL_SELF_INITTCAP_BASE = round_up_to_pow2(BOOT_CAPTBL_SELF_INITRCV_BASE + NUM_CPU * CAP64B_IDSZ, + CAPMAX_ENTRY_SZ), + BOOT_CAPTBL_LAST_CAP = BOOT_CAPTBL_SELF_INITTCAP_BASE + NUM_CPU * CAP64B_IDSZ, /* round up to next entry */ BOOT_CAPTBL_FREE = round_up_to_pow2(BOOT_CAPTBL_LAST_CAP, CAPMAX_ENTRY_SZ) }; -#define BOOT_CAPTBL_SELF_INITTCAP_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE + CAP16B_IDSZ) - #define BOOT_CAPTBL_SELF_INITTHD_CPU_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE (BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITRCV_CPU_BASE (BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE + cpuid * CAP64B_IDSZ) -#define BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) + CAP16B_IDSZ) +#define BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTCAP_BASE + cpuid * CAP64B_IDSZ) #define BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITRCV_BASE + cpuid * CAP64B_IDSZ) enum llboot_scb_dcb_caps From 11a1623e009befdc9896713f90f3916b9e123dc1 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 15 May 2019 15:48:03 -0400 Subject: [PATCH 072/127] fixed line-wraps for some. 
--- src/components/lib/cos_dcb.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/components/lib/cos_dcb.c b/src/components/lib/cos_dcb.c index 576d1dc2b2..e73069af8f 100644 --- a/src/components/lib/cos_dcb.c +++ b/src/components/lib/cos_dcb.c @@ -5,7 +5,8 @@ static struct cos_dcbinfo_data _cos_dcbinfo[NUM_CPU]; void -cos_dcb_info_init_ext(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci, dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t start_off) +cos_dcb_info_init_ext(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci, + dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t start_off) { memset(cdi, 0, sizeof(struct cos_dcbinfo_data)); @@ -19,7 +20,8 @@ void cos_dcb_info_init(struct cos_dcbinfo_data *cdi, struct cos_compinfo *ci) { if (cos_spd_id() == 0) { - cos_dcb_info_init_ext(cdi, ci, LLBOOT_CAPTBL_CPU_INITDCB, (vaddr_t)cos_init_dcb_get(), 1); + cos_dcb_info_init_ext(cdi, ci, LLBOOT_CAPTBL_CPU_INITDCB, + (vaddr_t)cos_init_dcb_get(), 1); } else { cos_dcb_info_init_ext(cdi, ci, 0, 0, 0); } @@ -39,7 +41,8 @@ cos_dcb_info_init_curr_ext(dcbcap_t initdcbcap, vaddr_t initdcbaddr, dcboff_t st if (initdcbcap == 0 && initdcbaddr == 0) { if (cos_spd_id() == 0) { - cos_dcb_info_init_ext(&_cos_dcbinfo[cos_cpuid()], ci, LLBOOT_CAPTBL_CPU_INITDCB, (vaddr_t)cos_init_dcb_get(), 1); + cos_dcb_info_init_ext(&_cos_dcbinfo[cos_cpuid()], ci, + LLBOOT_CAPTBL_CPU_INITDCB, (vaddr_t)cos_init_dcb_get(), 1); return; } else { @@ -76,7 +79,8 @@ cos_dcb_info_alloc(struct cos_dcbinfo_data *cdi, dcboff_t *dcboff, vaddr_t *dcba cdi->dcbaddr[curr_off + 1] = cos_page_bump_intern_valloc(cdi->ci, PAGE_SIZE); assert(cdi->dcbaddr[curr_off + 1]); - cdi->dcbcaps[curr_off + 1] = cos_dcb_alloc(cos_compinfo_get(cos_defcompinfo_curr_get()), cdi->ci->pgtbl_cap, cdi->dcbaddr[curr_off + 1]); + cdi->dcbcaps[curr_off + 1] = cos_dcb_alloc(cos_compinfo_get(cos_defcompinfo_curr_get()), + cdi->ci->pgtbl_cap, cdi->dcbaddr[curr_off + 1]); assert(cdi->dcbcaps[curr_off + 1]); ret = ps_cas((unsigned long *)&cdi->curr_cap, curr_off, curr_off + 1); From 218490eb4b4ea2a64787f682bf7ada540e594696 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 15 May 2019 16:42:49 -0400 Subject: [PATCH 073/127] Fixed blocking and now fibonacci is way faster * idea is, if we're the only thread on the current core and we're yielding: it just means we're at a barrier or some sync point. So let's just try and run the idle thread which will wakeup other thread. Thread that wakes up, tries work and if there is none, will block again waiting to be woken up by the idle thread! --- src/components/include/part.h | 65 ++++++++++++++++++------ src/components/lib/part.c | 10 ++++ src/components/lib/sl/sl_mod_part_fifo.c | 11 +++- 3 files changed, 69 insertions(+), 17 deletions(-) diff --git a/src/components/include/part.h b/src/components/include/part.h index 09e761c595..31b9f99a25 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -10,6 +10,7 @@ #include #undef PART_ENABLE_NESTED +#define PART_ENABLE_BLOCKING //#include DEQUE_PROTOTYPE(part, struct part_task *); @@ -118,9 +119,13 @@ part_deque_steal_any(void) static inline void part_pool_wakeup(void) { +#ifdef PART_ENABLE_BLOCKING struct sl_thd *t = NULL; int i; + /* we're still not in main parallel, so don't try to wakeup any threads yet! 
*/ + if (!ps_load(&in_main_parallel)) return; + sl_cs_enter(); if (unlikely(ps_list_head_empty(part_thdpool_curr()))) { sl_cs_exit(); @@ -136,11 +141,13 @@ part_pool_wakeup(void) sl_cs_exit(); sl_thd_wakeup(sl_thd_thdid(t)); +#endif } static inline void part_pool_block(void) { +#ifdef PART_ENABLE_BLOCKING struct sl_thd *t = sl_thd_curr(); assert(ps_list_singleton(t, partlist)); @@ -150,6 +157,9 @@ part_pool_block(void) sl_cs_exit(); sl_thd_block(0); +#else + sl_thd_yield(0); +#endif } ///* ds memory in a circular queue */ @@ -175,46 +185,71 @@ part_pool_block(void) static inline void part_list_append(struct part_task *t) { -#if defined(PART_ENABLE_NESTED) - assert(ps_list_singleton(t, partask)); - assert(t->type == PART_TASK_T_WORKSHARE); + int i, in_nest = 0; - if (t->nthds == 1) return; - crt_lock_take(&part_l_lock); - ps_list_head_append(part_list(), t, partask); - crt_lock_release(&part_l_lock); - part_pool_wakeup(); -#endif + assert(t->type == PART_TASK_T_WORKSHARE); +#if defined(PART_ENABLE_NESTED) + assert(ps_list_singleton(t, partask)); + /* + * this is not required to be in a cs. + * if multiple appends are called, simultaneously, we at least + * have the main outermost parallel block running!. + */ + if (likely(!ps_list_head_empty(part_list()))) in_nest = 1; + /* so other threads can work on this! */ + if (t->nthds > 1) { + crt_lock_take(&part_l_lock); + ps_list_head_append(part_list(), t, partask); + crt_lock_release(&part_l_lock); + } +#else if (t != &main_task) { + /* without nesting, all child parallel blocks are run just be the encountering threads -master threads */ + assert(t->nthds == 1); assert(ps_load(&in_main_parallel)); + return; } assert(ps_load(&in_main_parallel) == 0); - - ps_faa(&in_main_parallel, 1); +#endif + /* + * wake up as many threads on this core! + * some may not get work if other cores pull work before they get to it. + */ + for (i = 1; i < t->nthds; i++) part_pool_wakeup(); + + /* if this is the first time in a parallel, make everyone know */ + if (likely(!in_nest)) ps_faa(&in_main_parallel, 1); } static inline void part_list_remove(struct part_task *t) { -#if defined(PART_ENABLE_NESTED) + int in_nest = 0; + assert(t->type == PART_TASK_T_WORKSHARE); + assert(t->nthds > 1); +#if defined(PART_ENABLE_NESTED) assert(!ps_list_singleton(t, partask)); crt_lock_take(&part_l_lock); ps_list_rem(t, partask); + if (unlikely(!ps_list_head_empty(part_list()))) in_nest = 1; crt_lock_release(&part_l_lock); -#endif +#else + /* only called for the other parallel region */ assert(ps_load(&in_main_parallel)); if (t != &main_task) return; +#endif - ps_faa(&in_main_parallel, -1); + if (likely(!in_nest)) ps_faa(&in_main_parallel, -1); } static inline struct part_task * part_list_peek(void) { + /* there should at least be the outer parallel block for other threads to peek! */ if (!ps_load(&in_main_parallel)) return NULL; #if defined(PART_ENABLE_NESTED) @@ -308,7 +343,7 @@ part_thd_fn(void *d) struct sl_thd *curr = sl_thd_curr(); /* parallel runtime not ready? */ - if (unlikely(!part_isready())) part_pool_block(); + /* if (unlikely(!part_isready())) part_pool_block(); */ /* not in the main parallel block? 
*/ while (!ps_load(&in_main_parallel)) part_pool_block(); diff --git a/src/components/lib/part.c b/src/components/lib/part.c index 684e58ee2c..800ba632f1 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -151,9 +151,19 @@ part_init(void) assert(x); } +#ifdef PART_ENABLE_BLOCKING + sl_cs_enter(); + /* + * because it's fifo, all threads would go block + * themselves up as there is no work yet + * eventually returning to this main thread on core-0, + * and on all other cores, scheduler would be running! + */ + sl_cs_exit_schedule(); it = sl_thd_alloc(part_idle_fn, NULL); assert(it); sl_thd_param_set(it, ip); +#endif ps_faa(&all_done, 1); while (ps_load(&all_done) != NUM_CPU) ; diff --git a/src/components/lib/sl/sl_mod_part_fifo.c b/src/components/lib/sl/sl_mod_part_fifo.c index 8ea5908773..53eac3dd13 100644 --- a/src/components/lib/sl/sl_mod_part_fifo.c +++ b/src/components/lib/sl/sl_mod_part_fifo.c @@ -24,11 +24,18 @@ sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) struct sl_thd_policy * sl_mod_schedule(void) { + struct sl_thd_policy *c = sl_mod_thd_policy_get(sl_thd_curr()); struct sl_thd_policy *t = NULL; if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; - return ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); - + t = ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); + + /* + * we're the only thread and we're yielding, that + * means, we don't want to run anymore. run idle thread so it can + * pick someone else and that can do some work! + */ + if (likely(c != t)) return t; done: if (likely(idle_thd[cos_cpuid()])) return idle_thd[cos_cpuid()]; From 2f7c7619953a666dc9513af7ce749e5413bc6d6e Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 15 May 2019 16:45:19 -0400 Subject: [PATCH 074/127] Bumped up number of cores. This macro should be in a separate header and part of git-ignore so we don't need to commit this file for our changes. 
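A minimal sketch of the suggestion above, assuming the core count moves into a build-generated, git-ignored header; the name num_cpus.h is illustrative only and does not exist in this tree:

/* num_cpus.h: hypothetical header, emitted by the build and listed in
 * .gitignore, so changing the core count never dirties cos_config.h. */
#ifndef NUM_CPUS_H
#define NUM_CPUS_H
#define NUM_CPU 4 /* the value this patch series runs with */
#endif

/* cos_config.h would then pick the value up next to cpu_ghz.h instead of
 * hard-coding it: */
#include "cpu_ghz.h"
#include "num_cpus.h" /* hypothetical generated header */

Everything already derived from NUM_CPU (NUM_CPU_BMP_BYTES/WORDS, MAX_NUM_THREADS, the per-core sl arrays) would keep working unchanged, so a per-machine core count only ever touches the ignored header.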
--- src/kernel/include/shared/cos_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index 8c46ae5377..8477e914dd 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -17,7 +17,7 @@ #include "cpu_ghz.h" -#define NUM_CPU 2 +#define NUM_CPU 4 #define NUM_CPU_BMP_BYTES ((NUM_CPU + 7) / 8) #define NUM_CPU_BMP_WORDS ((NUM_CPU_BMP_BYTES + 3) / 4) From d88f400352f496990bdbdeff581a00ea923d6663 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 15 May 2019 17:35:03 -0400 Subject: [PATCH 075/127] fixed gcc warnings for -ve core id --- src/components/lib/sl/sl_thd_static_backend.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/components/lib/sl/sl_thd_static_backend.c b/src/components/lib/sl/sl_thd_static_backend.c index 3cce2a005a..2985f8f5e5 100644 --- a/src/components/lib/sl/sl_thd_static_backend.c +++ b/src/components/lib/sl/sl_thd_static_backend.c @@ -20,7 +20,7 @@ static u32_t __sl_aep_free_off[NUM_CPU]; static inline struct sl_thd_policy * sl_thd_alloc_backend_core(cpuid_t core, thdid_t tid) { - assert(tid < SL_MAX_NUM_THDS); + assert(tid < SL_MAX_NUM_THDS && core >= 0 && core < NUM_CPU); return &(__sl_threads[core][tid]); } @@ -31,6 +31,7 @@ sl_thd_alloc_aep_backend_core(cpuid_t core) int off = 0; struct cos_aep_info *aep = NULL; + assert(core < NUM_CPU && core >= 0); off = ps_faa((unsigned long *)&__sl_aep_free_off[core], 1); assert(off < SL_MAX_NUM_THDS); aep = &__sl_aep_infos[core][off]; @@ -41,7 +42,7 @@ sl_thd_alloc_aep_backend_core(cpuid_t core) struct sl_thd_policy * sl_thd_migrate_backend(struct sl_thd_policy *t, cpuid_t core) { - assert(core != cos_cpuid() && core < NUM_CPU); + assert(core != cos_cpuid() && core >= 0 && core < NUM_CPU); struct cos_aep_info *a = sl_thd_alloc_aep_backend_core(core); struct cos_aep_info *b = sl_thd_aepinfo(sl_mod_thd_get(t)); From e2af6f83f63fc869e5ca12c18538881afc5330f8 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 15 May 2019 17:35:35 -0400 Subject: [PATCH 076/127] yield api with timeout to program timers * everything else uses non-timeout fast-user-dispatch api --- src/components/include/sl.h | 160 +++++++++++++++++++++++-------- src/components/lib/sl/sl_sched.c | 64 +++++++++---- 2 files changed, 162 insertions(+), 62 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index ed21342c10..bd1b60d66a 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -40,7 +40,6 @@ #include #include -#undef SL_TIMEOUTS #define SL_CS #undef SL_REPLENISH @@ -299,6 +298,7 @@ int sl_thd_sched_wakeup_no_cs(struct sl_thd *t); int sl_thd_wakeup_no_cs_rm(struct sl_thd *t); void sl_thd_yield_intern(thdid_t tid); +void sl_thd_yield_intern_timeout(cycles_t abs_timeout); void sl_thd_yield_cs_exit(thdid_t tid); @@ -371,7 +371,6 @@ sl_timeout_period_get(void) return sl__globals_core()->period; } -#ifdef SL_TIMEOUTS static inline void sl_timeout_oneshot(cycles_t absolute_us) { @@ -406,7 +405,7 @@ struct heap *sl_timeout_heap(void); static inline void sl_timeout_wakeup_expired(cycles_t now) { - if (!heap_size(sl_timeout_heap())) return; + if (likely(!heap_size(sl_timeout_heap()))) return; do { struct sl_thd *tp, *th; @@ -426,7 +425,6 @@ sl_timeout_wakeup_expired(cycles_t now) sl_thd_wakeup_no_cs_rm(th); } while (heap_size(sl_timeout_heap())); } -#endif static inline int sl_thd_is_runnable(struct sl_thd *t) @@ -495,7 +493,7 @@ sl_thd_dispatch(struct 
sl_thd *next, sched_tok_t tok, struct sl_thd *curr) } static inline int -sl_thd_activate(struct sl_thd *t, sched_tok_t tok) +sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = &dci->ci; @@ -503,13 +501,13 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) int ret = 0; if (t->properties & SL_THD_PROPERTY_SEND) { - return cos_sched_asnd(t->sndcap, g->timeout_next, g->sched_rcv, tok); + return cos_sched_asnd(t->sndcap, timeout, g->sched_rcv, tok); } else if (t->properties & SL_THD_PROPERTY_OWN_TCAP) { return cos_switch(sl_thd_thdcap(t), sl_thd_tcap(t), t->prio, - g->timeout_next, g->sched_rcv, tok); + timeout, g->sched_rcv, tok); } else { ret = cos_defswitch(sl_thd_thdcap(t), t->prio, t == g->sched_thd ? - TCAP_TIME_NIL : g->timeout_next, tok); + TCAP_TIME_NIL : timeout, tok); if (likely(t != g->sched_thd && t != g->idle_thd)) return ret; if (unlikely(ret != -EPERM)) return ret; @@ -517,8 +515,8 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok) * Attempting to activate scheduler thread or idle thread failed for no budget in it's tcap. * Force switch to the scheduler with current tcap. */ - return cos_switch(sl_thd_thdcap(t), g->sched_tcap, t->prio, - g->timeout_next, g->sched_rcv, tok); + return cos_switch(g->sched_thdcap, g->sched_tcap, t->prio, + timeout, g->sched_rcv, tok); } } @@ -536,6 +534,7 @@ sl_cs_exit_schedule_nospin_arg_c(struct sl_thd *curr, struct sl_thd *next) return sl_thd_dispatch(next, tok, curr); } +void sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now); /* * Do a few things: 1. take the critical section if it isn't already * taken, 2. call schedule to find the next thread to run, 3. release @@ -566,9 +565,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) struct sl_thd *t = to; struct sl_global_core *globals = sl__globals_core(); sched_tok_t tok; -#if defined(SL_TIMEOUTS) || defined(SL_REPLENISH) cycles_t now; -#endif s64_t offset; int ret; @@ -578,15 +575,12 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) #endif tok = cos_sched_sync(); -#if defined(SL_TIMEOUTS) || defined(SL_REPLENISH) now = sl_now(); -#endif -#ifdef SL_TIMEOUTS + /* still wakeup without timeouts? that adds to dispatch overhead! 
*/ offset = (s64_t)(globals->timer_next - now); if (globals->timer_next && offset <= 0) sl_timeout_expended(now, globals->timer_next); sl_timeout_wakeup_expired(now); -#endif /* * Once we exit, we can't trust t's memory as it could be @@ -603,50 +597,94 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) struct sl_thd_policy *pt = sl_mod_schedule(); if (unlikely(!pt)) - t = sl__globals_core()->idle_thd; + t = globals->sched_thd; else t = sl_mod_thd_get(pt); } #ifdef SL_REPLENISH - if (t->properties & SL_THD_PROPERTY_OWN_TCAP && t->budget) { - struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + sl_thd_replenish_no_cs(t, now); +#endif - assert(t->period); - assert(sl_thd_tcap(t) != sl__globals_core()->sched_tcap); +// assert(t && sl_thd_is_runnable(t)); +#ifdef SL_CS + sl_cs_exit(); +#endif + if (unlikely(t == sl_thd_curr())) return 0; - if (t->last_replenish == 0 || t->last_replenish + t->period <= now) { - tcap_res_t currbudget = 0; - cycles_t replenish = now - ((now - t->last_replenish) % t->period); + ret = sl_thd_dispatch(t, tok, sl_thd_curr()); - ret = 0; - currbudget = (tcap_res_t)cos_introspect(ci, sl_thd_tcap(t), TCAP_GET_BUDGET); +#ifdef SL_REPLENISH + /* + * dispatch failed with -EPERM because tcap associated with thread t does not have budget. + * Block the thread until it's next replenishment and return to the scheduler thread. + * + * If the thread is not replenished by the scheduler (replenished "only" by + * the inter-component delegations), block till next timeout and try again. + */ + if (unlikely(ret == -EPERM)) { + assert(t != globals->sched_thd && t != globals->idle_thd); + sl_thd_block_expiry(t); + if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok, globals->timeout_next); + } +#endif - if (!cycles_same(currbudget, t->budget, SL_CYCS_DIFF) && currbudget < t->budget) { - tcap_res_t transfer = t->budget - currbudget; + return ret; +} + +static inline int +sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) +{ + struct sl_thd *t = to; + struct sl_global_core *globals = sl__globals_core(); + sched_tok_t tok; + cycles_t now; + s64_t offset; + int ret; - /* tcap_transfer will assign sched_tcap's prio to t's tcap if t->prio == 0, which we don't want. */ - assert(t->prio >= TCAP_PRIO_MAX && t->prio <= TCAP_PRIO_MIN); - ret = cos_tcap_transfer(sl_thd_rcvcap(t), globals->sched_tcap, transfer, t->prio); - } + /* Don't abuse this, it is only to enable the tight loop around this function for races... */ +#ifdef SL_CS + if (likely(!sl_cs_owner())) sl_cs_enter(); +#endif + + tok = cos_sched_sync(); + now = sl_now(); + + offset = (s64_t)(globals->timer_next - now); + if (globals->timer_next && offset <= 0) sl_timeout_expended(now, globals->timer_next); + sl_timeout_wakeup_expired(now); + + /* + * Once we exit, we can't trust t's memory as it could be + * deallocated/modified, so cache it locally. If these values + * are out of date, the scheduler synchronization tok will + * catch it. This is a little twitchy and subtle, so lets put + * it in a function, here. 
+ */ + if (likely(to)) { + t = to; + if (unlikely(!sl_thd_is_runnable(t))) to = NULL; + } + if (unlikely(!to)) { + struct sl_thd_policy *pt = sl_mod_schedule(); - if (likely(ret == 0)) t->last_replenish = replenish; - } + if (unlikely(!pt)) + t = globals->sched_thd; + else + t = sl_mod_thd_get(pt); } + +#ifdef SL_REPLENISH + sl_thd_replenish_no_cs(t, now); #endif // assert(t && sl_thd_is_runnable(t)); #ifdef SL_CS sl_cs_exit(); #endif - if (t == sl__globals_core()->idle_thd) t = sl__globals_core()->sched_thd; - if (t == sl_thd_curr()) return 0; + if (unlikely(t == sl_thd_curr())) return 0; -#ifdef SL_TIMEOUTS - ret = sl_thd_activate(t, tok); -#else - ret = sl_thd_dispatch(t, tok, sl_thd_curr()); -#endif + ret = sl_thd_activate(t, tok, abs_timeout ? tcap_cyc2time(abs_timeout) : globals->timeout_next); #ifdef SL_REPLENISH /* @@ -659,7 +697,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) if (unlikely(ret == -EPERM)) { assert(t != globals->sched_thd && t != globals->idle_thd); sl_thd_block_expiry(t); - if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok); + if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok, globals->timeout_next); } #endif @@ -693,6 +731,33 @@ sl_cs_exit_switchto(struct sl_thd *to) } } +static inline int +sl_cs_exit_schedule_nospin_timeout(cycles_t abs_timeout) +{ + return sl_cs_exit_schedule_nospin_arg_timeout(NULL, abs_timeout); +} + +static inline void +sl_cs_exit_schedule_timeout(cycles_t abs_timeout) +{ + while (sl_cs_exit_schedule_nospin_timeout(abs_timeout) && sl_now() < abs_timeout) + ; +} + +static inline void +sl_cs_exit_switchto_timeout(struct sl_thd *to, cycles_t abs_timeout) +{ + /* + * We only try once, so it is possible that we don't end up + * switching to the desired thread. However, this is always a + * case that the caller has to consider if the current thread + * has a higher priority than the "to" thread. 
+ */ + if (sl_cs_exit_schedule_nospin_arg_timeout(to, abs_timeout)) { + sl_cs_exit_schedule_timeout(abs_timeout); + } +} + static inline void sl_cs_exit_switchto_c(struct sl_thd *c, struct sl_thd *n) { @@ -759,6 +824,17 @@ sl_thd_yield(thdid_t tid) } } +static inline void +sl_thd_yield_timeout(thdid_t tid, cycles_t abs_timeout) +{ + if (likely(tid)) { + sl_cs_enter(); + sl_cs_exit_switchto_timeout(sl_thd_lkup(tid), abs_timeout); + } else { + sl_thd_yield_intern_timeout(abs_timeout); + } +} + static inline int sl_thd_rcv(rcv_flags_t flags) { diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index c05fb58f66..80ecc22297 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -61,7 +61,6 @@ sl_cs_exit_contention(union sl_cs_intern *csi, union sl_cs_intern *cached, struc return 0; } -#ifdef SL_TIMEOUTS /* Timeout and wakeup functionality */ /* * TODO: @@ -131,23 +130,6 @@ sl_timeout_init(microsec_t period) memset(&timeout_heap[cos_cpuid()], 0, sizeof(struct timeout_heap)); heap_init(sl_timeout_heap(), SL_MAX_NUM_THDS, __sl_timeout_compare_min, __sl_timeout_update_idx); } -#else -static inline void -sl_timeout_remove(struct sl_thd *t) -{ } - -static inline void -sl_timeout_block(struct sl_thd *t, cycles_t timeout) -{ } - -static void -sl_timeout_init(microsec_t period) -{ - assert(period >= SL_MIN_PERIOD_US); - - sl_timeout_period(period); -} -#endif void sl_thd_free_no_cs(struct sl_thd *t) @@ -491,6 +473,18 @@ sl_thd_yield_intern(thdid_t tid) sl_thd_yield_cs_exit_intern(tid); } +void +sl_thd_yield_intern_timeout(cycles_t abs_timeout) +{ + struct sl_thd *t = sl_thd_curr(); + + sl_cs_enter(); + /* reset rcv_suspended if the scheduler thinks "curr" was suspended on cos_rcv previously */ + sl_thd_sched_unblock_no_cs(t); + if (likely(t != sl__globals_core()->sched_thd && t != sl__globals_core()->idle_thd)) sl_mod_yield(sl_mod_thd_policy_get(t), NULL); + sl_cs_exit_schedule_timeout(abs_timeout); +} + void sl_thd_event_info_reset(struct sl_thd *t) { @@ -573,9 +567,7 @@ sl_timeout_period(microsec_t period) cycles_t p = sl_usec2cyc(period); sl__globals_core()->period = p; -#ifdef SL_TIMEOUTS sl_timeout_relative(p); -#endif } /* engage space heater mode */ @@ -782,3 +774,35 @@ sl_thd_kern_dispatch(thdcap_t t) //return cos_switch(t, sl__globals_core()->sched_tcap, 0, sl__globals_core()->timeout_next, sl__globals_core()->sched_rcv, cos_sched_sync()); return cos_thd_switch(t); } + +void +sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now) +{ +#ifdef SL_REPLENISH + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + tcap_res_t currbudget = 0; + cycles_t replenish; + int ret; + + if (!(t->properties & SL_THD_PROPERTY_OWN_TCAP && t->budget)) return; + assert(t->period); + assert(sl_thd_tcap(t) != sl__globals_core()->sched_tcap); + + if (!(t->last_replenish == 0 || t->last_replenish + t->period <= now)) return; + + replenish = now - ((now - t->last_replenish) % t->period); + + ret = 0; + currbudget = (tcap_res_t)cos_introspect(ci, sl_thd_tcap(t), TCAP_GET_BUDGET); + + if (!cycles_same(currbudget, t->budget, SL_CYCS_DIFF) && currbudget < t->budget) { + tcap_res_t transfer = t->budget - currbudget; + + /* tcap_transfer will assign sched_tcap's prio to t's tcap if t->prio == 0, which we don't want. 
*/ + assert(t->prio >= TCAP_PRIO_MAX && t->prio <= TCAP_PRIO_MIN); + ret = cos_tcap_transfer(sl_thd_rcvcap(t), globals->sched_tcap, transfer, t->prio); + } + + if (likely(ret == 0)) t->last_replenish = replenish; +#endif +} From 8f870d8610b22cc027aac8c26fb187c40e1e26fd Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 15 May 2019 18:19:23 -0400 Subject: [PATCH 077/127] use heap for deque --- src/components/include/part.h | 14 +++++++++----- src/components/lib/part.c | 9 +++++++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/src/components/include/part.h b/src/components/include/part.h index 31b9f99a25..0f4e939ffc 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -16,7 +16,7 @@ DEQUE_PROTOTYPE(part, struct part_task *); //CIRQUE_PROTOTYPE(part, struct part_task); -extern struct deque_part part_dq_percore[]; +extern struct deque_part *part_dq_percore[]; //extern struct cirque_par parcq_global; /* FIXME: use stacklist or another stack like data structure? */ extern struct ps_list_head part_thdpool_core[]; @@ -31,7 +31,7 @@ extern struct part_task main_task; static inline struct deque_part * part_deque_curr(void) { - return &part_dq_percore[cos_cpuid()]; + return part_dq_percore[cos_cpuid()]; } static inline struct deque_part * @@ -39,7 +39,7 @@ part_deque_core(cpuid_t c) { assert(c < NUM_CPU); - return &part_dq_percore[c]; + return part_dq_percore[c]; } static inline struct ps_list_head * @@ -89,6 +89,7 @@ part_deque_pop(struct part_task **t) static inline struct part_task * part_deque_steal(cpuid_t core) { +#if NUM_CPU > 1 int ret; struct part_task *t = NULL; @@ -96,11 +97,15 @@ part_deque_steal(cpuid_t core) if (ret) return NULL; return t; +#else + return NULL; +#endif } static inline struct part_task * part_deque_steal_any(void) { +#if NUM_CPU > 1 unsigned i = 0, c = (unsigned)(ps_tsc() % NUM_CPU); do { @@ -112,7 +117,7 @@ part_deque_steal_any(void) t = part_deque_steal(c); if (likely(t)) return t; } while (i < NUM_CPU); - +#endif return NULL; } @@ -336,7 +341,6 @@ part_task_end(struct part_task *t) } } - static inline void part_thd_fn(void *d) { diff --git a/src/components/lib/part.c b/src/components/lib/part.c index 800ba632f1..9471fb9e97 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -9,8 +9,9 @@ #define PART_MAX_PAGES (((PART_MAX_TASKS * sizeof(struct part_task)) / PAGE_SIZE) + 1) #define PART_MAX_DATA_PAGES (((PART_MAX_TASKS * sizeof(struct part_data)) / PAGE_SIZE) + 1) +#define PART_DEQUE_MAX_PAGES ((sizeof(struct deque_part) / PAGE_SIZE) + 1) -struct deque_part part_dq_percore[NUM_CPU]; +struct deque_part *part_dq_percore[NUM_CPU]; //struct cirque_par parcq_global; static volatile unsigned part_ready = 0; volatile int in_main_parallel; @@ -119,7 +120,11 @@ part_init(void) ps_list_head_init(&part_thdpool_core[cos_cpuid()]); if (ps_cas(&is_first, NUM_CPU, cos_cpuid())) { - for (k = 0; k < NUM_CPU; k++) deque_init_part(&part_dq_percore[k], PART_DEQUE_SZ); + for (k = 0; k < NUM_CPU; k++) { + part_dq_percore[k] = (struct deque_part *)memmgr_heap_page_allocn(PART_DEQUE_MAX_PAGES); + assert(part_dq_percore[k]); + deque_init_part(part_dq_percore[k], PART_DEQUE_SZ); + } part_tasks = (struct part_task *)memmgr_heap_page_allocn(PART_MAX_PAGES); assert(part_tasks); memset(part_tasks, 0, PART_MAX_PAGES * PAGE_SIZE); From fefa23dffc707c3fa20621428f263c588604e79f Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 16 May 2019 00:44:34 -0400 Subject: [PATCH 078/127] Fixed a race in explicit task execution --- 
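Note for reviewers: after this change the owner-side pop in deque.h follows the ordering from the work-stealing deque paper cited in that header: lower bottom to claim a slot before reading top, and only race on top (via a CAS) for the last remaining element. The function below is a simplified sketch of that logic, not the exact DEQUE_PROTOTYPE macro body; the real code goes through ps_load/ps_upcas/ps_cas for every access, which is elided here for readability, and deque_pop_sketch is a made-up name.

    /* sketch of the corrected owner-side pop; q->bottom is core-local, q->top is shared with stealers */
    static inline int
    deque_pop_sketch(struct deque_part *q, struct part_task **w)
    {
            long b, t;
            int won;

            b = q->bottom - 1;
            q->bottom = b;                 /* publish the claim before reading top */
            t = q->top;
            if (b - t < 0) {               /* deque empty: restore bottom to top */
                    q->bottom = t;
                    return -ENOENT;
            }
            *w = q->wrk[b];
            if (b - t > 0) return 0;       /* not the last element, no race possible */
            /* last element: the CAS on top decides between this core and a stealer */
            won = ps_cas((unsigned long *)&q->top, t, t + 1);
            q->bottom = t + 1;
            if (!won) { *w = NULL; return -ENOENT; }

            return 0;
    }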
src/components/include/deque.h | 15 +++--- src/components/include/part.h | 34 +++++++++----- src/components/include/part_task.h | 64 ++++++++++++++++---------- src/components/lib/cos_gomp/cos_gomp.c | 14 ++---- src/components/lib/part.c | 6 +-- 5 files changed, 77 insertions(+), 56 deletions(-) diff --git a/src/components/include/deque.h b/src/components/include/deque.h index 1190814648..ff6cff030e 100644 --- a/src/components/include/deque.h +++ b/src/components/include/deque.h @@ -60,7 +60,7 @@ deque_push_##name(struct deque_##name *q, type *w) \ \ q->wrk[cb] = *w; \ ps_mem_fence(); \ - if (!ps_upcas((unsigned long *)&q->bottom, cb, cb + 1)) assert(0); \ + if (!ps_cas((unsigned long *)&q->bottom, cb, cb + 1)) assert(0); \ \ return 0; \ } \ @@ -69,15 +69,16 @@ deque_push_##name(struct deque_##name *q, type *w) \ static inline int \ deque_pop_##name(struct deque_##name *q, type *w) \ { \ - long ct = ps_load((unsigned long *)&q->top); \ + long ct = 0, sz = 0; \ long cb = ps_load((unsigned long *)&q->bottom) - 1; \ - long sz = cb - ct; \ int ret = 0; \ \ - if (!ps_upcas((unsigned long *)&q->bottom, cb + 1, cb)) assert(0); \ + if (!ps_cas((unsigned long *)&q->bottom, cb + 1, cb)) assert(0); \ \ + ct = ps_load((unsigned long *)&q->top); \ + sz = cb - ct; \ if (sz < 0) { \ - if (!ps_cas((unsigned long *)&q->bottom, cb, cb + 1)) assert(0); \ + if (!ps_cas((unsigned long *)&q->bottom, cb, ct)) assert(0); \ \ return -ENOENT; \ } \ @@ -86,8 +87,8 @@ deque_pop_##name(struct deque_##name *q, type *w) \ if (sz > 0) return 0; \ \ ret = ps_cas((unsigned long *)&q->top, ct, ct + 1); \ - if (!ps_upcas((unsigned long *)&q->bottom, cb, ct + 1)) assert(0); \ - if (!ret) return -ENOENT; \ + if (!ps_cas((unsigned long *)&q->bottom, cb, ct + 1)) assert(0); \ + if (!ret) { *w = NULL; return -ENOENT; } \ \ return 0; \ } \ diff --git a/src/components/include/part.h b/src/components/include/part.h index 0f4e939ffc..96579eb13f 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -67,6 +67,7 @@ part_deque_push(struct part_task *t) { int ret; + assert(t->type == PART_TASK_T_TASK); sl_cs_enter(); ret = deque_push_part(part_deque_curr(), &t); sl_cs_exit(); @@ -79,9 +80,13 @@ part_deque_pop(struct part_task **t) { int ret; + *t = NULL; sl_cs_enter(); ret = deque_pop_part(part_deque_curr(), t); sl_cs_exit(); + if (unlikely(ret)) *t = NULL; + + if (unlikely(*t && (*t)->type != PART_TASK_T_TASK)) { *t = NULL; ret = -EAGAIN; } return ret; } @@ -94,7 +99,8 @@ part_deque_steal(cpuid_t core) struct part_task *t = NULL; ret = deque_steal_part(part_deque_core(core), &t); - if (ret) return NULL; + if (unlikely(ret)) return NULL; + assert(t->type == PART_TASK_T_TASK); return t; #else @@ -190,7 +196,8 @@ part_pool_block(void) static inline void part_list_append(struct part_task *t) { - int i, in_nest = 0; + unsigned i; + int in_nest = 0; assert(t->type == PART_TASK_T_WORKSHARE); @@ -222,7 +229,7 @@ part_list_append(struct part_task *t) * wake up as many threads on this core! * some may not get work if other cores pull work before they get to it. 
*/ - for (i = 1; i < t->nthds; i++) part_pool_wakeup(); + for (i = 0; i < t->nthds; i++) part_pool_wakeup(); /* if this is the first time in a parallel, make everyone know */ if (likely(!in_nest)) ps_faa(&in_main_parallel, 1); @@ -308,20 +315,23 @@ static inline void part_task_end(struct part_task *t) { struct sl_thd *ts = sl_thd_curr(); + struct part_task *p = t->parent; int tn = part_task_work_thd_num(t); - assert(tn >= 0 && t->nthds >= 1); + assert(t->type != PART_TASK_T_NONE); + assert(tn >= 0); + assert(t->nthds >= 1); assert(ts->part_context == (void *)t); if (t->nthds == 1) { int i; assert(tn == 0); part_task_wait_children(t); - ps_faa(&t->end, 1); part_task_remove_child(t->parent, t); + ps_faa(&t->end, 1); if (t->type == PART_TASK_T_WORKSHARE) { assert(t->workers[tn] == t->master); - ts->part_context = t->parent; + ts->part_context = p; } return; @@ -330,7 +340,7 @@ part_task_end(struct part_task *t) if (tn == 0) { if (t->type == PART_TASK_T_WORKSHARE) part_list_remove(t); - ts->part_context = t->parent; + ts->part_context = p; part_task_remove_child(t->parent, t); ps_faa(&t->end, 1); } else { @@ -367,7 +377,7 @@ part_thd_fn(void *d) single: ret = part_deque_pop(&t); if (likely(ret == 0)) { - assert(t && t->type != PART_TASK_T_WORKSHARE); + assert(t && t->type == PART_TASK_T_TASK); thdnum = part_task_work_try(t); if (thdnum == 0) goto found; } @@ -380,20 +390,22 @@ part_thd_fn(void *d) continue; } - assert(t->type != PART_TASK_T_WORKSHARE); + assert(t->type == PART_TASK_T_TASK); found: + assert(t->type != PART_TASK_T_NONE); if (unlikely(thdnum < 0)) thdnum = part_task_work_try(t); if (unlikely(thdnum < 0)) continue; - if (t->type != PART_TASK_T_WORKSHARE) assert(thdnum == 0); + if (t->type == PART_TASK_T_TASK) assert(thdnum == 0); curr->part_context = (void *)t; t->cs.fn(t->cs.data); part_task_end(t); /* free the explicit task! 
*/ - if (t->type != PART_TASK_T_WORKSHARE) { + if (t->type == PART_TASK_T_TASK) { struct part_data *d = t->data_env; + assert(t->nthds == 1 && t->end == 1); part_task_free(t); part_data_free(d); } diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index 050a82f89e..83b1b3f7bc 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -11,7 +11,7 @@ #define PART_MAX_TASKS 256 #define PART_MAX_DATA 128 -#define PART_MAX_PAR_THDS 4 +#define PART_MAX_PAR_THDS NUM_CPU #define PART_MAX_THDS 128 #define PART_MAX_CORE_THDS (PART_MAX_THDS/NUM_CPU) #define PART_MAX_CHILD 16 @@ -31,7 +31,9 @@ typedef enum { } part_task_state_t; typedef enum { + PART_TASK_T_NONE, PART_TASK_T_WORKSHARE = 1, /* task to put in a shared fifo queue */ + PART_TASK_T_TASK, } part_task_type_t; typedef enum { @@ -67,6 +69,7 @@ struct part_data { }; struct part_task { + int id; /* only for debugging */ part_task_state_t state; part_task_type_t type; @@ -89,27 +92,28 @@ struct part_task { static inline void part_task_init(struct part_task *t, part_task_type_t type, struct part_task *p, unsigned nthds, part_fn_t fn, void *data, struct part_data *d) { - int i; - - memset(t, 0, sizeof(struct part_task)); + static unsigned part_id_free = 0; + int i, id = ps_faa(&part_id_free, 1); - ps_list_init(t, partask); + assert(type != PART_TASK_T_NONE); t->type = type; - t->state = PART_TASK_S_INITIALIZED; - t->parent = p; - t->nthds = nthds; - t->nchildren = 0; - t->barrier_in = t->barrier_out = t->end = 0; - t->data_env = d; - - t->master = PART_CURR_THD; + if (!ps_cas(&t->state, PART_TASK_S_ALLOCATED, PART_TASK_S_INITIALIZED)) assert(0); + t->id = id; + memset(t->ws, 0, sizeof(struct part_workshare) * PART_MAX_WORKSHARES); t->cs.fn = fn; t->cs.data = data; - - for (i = 0; i < PART_MAX_PAR_THDS; i++) t->ws_off[i] = -1; - + t->nthds = nthds; + memset(t->workers, 0, sizeof(unsigned) * PART_MAX_PAR_THDS); + t->master = PART_CURR_THD; /* if it's worksharing, current thread is the master and does take part in the par section */ if (type == PART_TASK_T_WORKSHARE) t->workers[0] = t->master; + for (i = 0; i < PART_MAX_PAR_THDS; i++) t->ws_off[i] = -1; + t->barrier_in = t->barrier_out = t->end = 0; + t->data_env = d; + t->parent = p; + t->nchildren = 0; + + ps_list_init(t, partask); } struct part_task *part_task_alloc(part_task_type_t); @@ -122,6 +126,8 @@ part_task_add_child(struct part_task *t, struct part_task *c) { int i; + assert(t->state == PART_TASK_S_INITIALIZED); + if (unlikely(!t || !c)) return -1; i = ps_faa(&t->nchildren, 1); @@ -136,7 +142,7 @@ part_task_remove_child(struct part_task *t, struct part_task *c) int i; if (unlikely(!t || !c)) return; - assert(ps_load(&t->nchildren)); + assert(t->state == PART_TASK_S_INITIALIZED); i = ps_faa(&t->nchildren, -1); assert(i > 0); @@ -145,6 +151,7 @@ part_task_remove_child(struct part_task *t, struct part_task *c) static inline void part_task_wait_children(struct part_task *t) { + assert(t->state == PART_TASK_S_INITIALIZED); while (ps_load(&t->nchildren) > 0) sl_thd_yield(0); assert(t->nchildren == 0); @@ -153,22 +160,26 @@ part_task_wait_children(struct part_task *t) static inline int part_task_work_try(struct part_task *t) { - unsigned i = 0; + unsigned i; unsigned key = PART_CURR_THD; - if (t->type != PART_TASK_T_WORKSHARE) { + assert(t->state == PART_TASK_S_INITIALIZED); + if (t->type == PART_TASK_T_TASK) { assert(t->nthds == 1); } else { + assert(t->type == PART_TASK_T_WORKSHARE); assert(t->master != key && t->master == 
t->workers[0]); assert(t->nthds >= 1); } - for (; i < t->nthds; i++) + for (i = 0; i < t->nthds; i++) { - if (t->workers[i] == key) return i; - if (t->workers[i]) continue; + unsigned w = ps_load(&t->workers[i]); + + if (w == key) return i; + if (w) continue; - if (likely(ps_cas(&t->workers[i], 0, key))) return i; + if (likely(ps_cas(&t->workers[i], w, key))) return i; } return -1; @@ -180,13 +191,15 @@ part_task_work_thd_num(struct part_task *t) int i; unsigned key = PART_CURR_THD; - if (t->type != PART_TASK_T_WORKSHARE) { + assert(t->state == PART_TASK_S_INITIALIZED); + if (t->type == PART_TASK_T_TASK) { assert(t->nthds == 1); - if (t->workers[0] == key) return 0; + if (ps_load(&t->workers[0]) == key) return 0; return -1; } + assert(t->type == PART_TASK_T_WORKSHARE); if (key == t->master) return 0; for (i = 1; i < (int)t->nthds; i++) { @@ -202,6 +215,7 @@ part_task_barrier(struct part_task *t) int tn = part_task_work_thd_num(t); unsigned cin = 0, cout = 0; + assert(t->state == PART_TASK_S_INITIALIZED); assert(tn >= 0 && t->nthds >= 1); if (t->nthds == 1) { diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index eb1b32da71..c3b5214856 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -15,7 +15,6 @@ #include #include /* for now, single core lock! */ #include -#include <../../interface/capmgr/memmgr.h> #include "cos_gomp.h" #include @@ -60,6 +59,7 @@ _gomp_parallel_start(struct part_task *pt, void (*fn) (void *), void *data, unsi if (unlikely(parent)) num_threads = 1; #endif + pt->state = PART_TASK_S_ALLOCATED; part_task_init(pt, PART_TASK_T_WORKSHARE, parent, num_threads, fn, data, NULL); assert(pt->nthds == num_threads); if (unlikely(parent)) { @@ -277,10 +277,6 @@ GOMP_loop_end (void) assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); part_task_barrier(t); - -// do { -// c = ps_load(&t->nwsdone); -// } while (!ps_cas(&t->nwsdone, c, c | (1 << woff))); } void @@ -291,9 +287,6 @@ GOMP_loop_end_nowait (void) int woff = t->ws_off[coff], c = 0; assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); -// do { -// c = ps_load(&t->nwsdone); -// } while (!ps_cas(&t->nwsdone, c, c | (1 << woff))); } void @@ -338,9 +331,10 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), else memcpy(arg, data, arg_size); assert(parent); - part_task_init(pt, 0, parent, 1, fn, arg, d); + part_task_init(pt, PART_TASK_T_TASK, parent, 1, fn, arg, d); parent_off = part_task_add_child(parent, pt); assert(parent_off >= 0); + assert(pt->type == PART_TASK_T_TASK); do { ret = part_deque_push(pt); @@ -353,7 +347,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), struct part_task pt; assert(parent); - part_task_init(&pt, 0, parent, 1, fn, data, NULL); + part_task_init(&pt, PART_TASK_T_TASK, parent, 1, fn, data, NULL); parent_off = part_task_add_child(parent, &pt); assert(parent_off >= 0); sl_thd_curr()->part_context = &pt; diff --git a/src/components/lib/part.c b/src/components/lib/part.c index 9471fb9e97..e29e7d9ab0 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -80,7 +80,7 @@ part_task_alloc(part_task_type_t type) for (i = 0; i < PART_MAX_TASKS; i++) { t = part_tasks + i; - if (t->state != PART_TASK_S_FREED) continue; + if (ps_load(&t->state) != PART_TASK_S_FREED) continue; /* if this fails, someone else just alloced it! 
*/ if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) continue; @@ -99,8 +99,8 @@ part_task_free(struct part_task *t) if (!t) return; do { - s = t->state; - assert(s != PART_TASK_S_FREED); + s = ps_load(&t->state); + if (s != PART_TASK_S_INITIALIZED) return; } while (!ps_cas(&t->state, s, PART_TASK_S_FREED)); } From 48bdcbc6cad17f9b94615f0e819cdce0594d33ef Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 16 May 2019 01:24:36 -0400 Subject: [PATCH 079/127] fixed timer programming to be either for an earlier timeout or on expiry --- src/components/include/sl.h | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index bd1b60d66a..4176e843dc 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -612,7 +612,12 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) #endif if (unlikely(t == sl_thd_curr())) return 0; - ret = sl_thd_dispatch(t, tok, sl_thd_curr()); + /* + * if the periodic timer is already ahead, + * don't reprogram it! + */ + if (likely(offset > globals->cyc_per_usec)) ret = sl_thd_dispatch(t, tok, sl_thd_curr()); + else ret = sl_thd_activate(t, tok, globals->timeout_next); #ifdef SL_REPLENISH /* @@ -684,7 +689,20 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) #endif if (unlikely(t == sl_thd_curr())) return 0; - ret = sl_thd_activate(t, tok, abs_timeout ? tcap_cyc2time(abs_timeout) : globals->timeout_next); + /* + * if the requested timeout is greater than next timeout + * and timer is already programmed to be over a usec away, don't + * reprogam it. + * + * else, reprogram for an earlier timeout requested. + */ + if (likely(offset > globals->cyc_per_usec + && abs_timeout > globals->timer_next)) { + ret = sl_thd_dispatch(t, tok, sl_thd_curr()); + } else { + ret = sl_thd_activate(t, tok, abs_timeout < globals->timer_next + ? tcap_cyc2time(abs_timeout) : globals->timeout_next); + } #ifdef SL_REPLENISH /* From 271f23c52fec28b7c00a1bbd5e4635c329a37b52 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 16 May 2019 12:02:53 -0400 Subject: [PATCH 080/127] change to upcas in deque for core-local push/pop * i had previously changed it to multi-core cas for some debugging, forgot to revert back. 
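To spell out the intent (my reading of the change, assuming ps_upcas is the cheaper uniprocessor compare-and-swap that is only safe when all writers sit on one core): bottom is touched exclusively by the owning core's push/pop, so it can use ps_upcas, while top is contended by remote stealers and must keep the fully atomic ps_cas.

    push (owner core only)        -> ps_upcas on q->bottom
    pop  (owner core only)        -> ps_upcas on q->bottom
    pop of the last element       -> ps_cas   on q->top (raced with stealers)
    steal (any other core)        -> ps_cas   on q->top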
--- src/components/include/deque.h | 10 +++++----- src/components/lib/cos_gomp/cos_gomp.c | 1 + src/platform/i386/qemu-kvm.sh | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/components/include/deque.h b/src/components/include/deque.h index ff6cff030e..6563039940 100644 --- a/src/components/include/deque.h +++ b/src/components/include/deque.h @@ -19,7 +19,7 @@ * PPoPP implementation paper, "Correct and Efficient Work-Stealing for Weak Memory Models" * https://www.di.ens.fr/~zappa/readings/ppopp13.pdf */ -#define DEQUE_MAX_SZ 4096 +#define DEQUE_MAX_SZ (1<<15) #define DEQUE_PROTOTYPE(name, type) \ struct deque_##name { \ @@ -60,7 +60,7 @@ deque_push_##name(struct deque_##name *q, type *w) \ \ q->wrk[cb] = *w; \ ps_mem_fence(); \ - if (!ps_cas((unsigned long *)&q->bottom, cb, cb + 1)) assert(0); \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, cb + 1)) assert(0); \ \ return 0; \ } \ @@ -73,12 +73,12 @@ deque_pop_##name(struct deque_##name *q, type *w) \ long cb = ps_load((unsigned long *)&q->bottom) - 1; \ int ret = 0; \ \ - if (!ps_cas((unsigned long *)&q->bottom, cb + 1, cb)) assert(0); \ + if (!ps_upcas((unsigned long *)&q->bottom, cb + 1, cb)) assert(0); \ \ ct = ps_load((unsigned long *)&q->top); \ sz = cb - ct; \ if (sz < 0) { \ - if (!ps_cas((unsigned long *)&q->bottom, cb, ct)) assert(0); \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, ct)) assert(0); \ \ return -ENOENT; \ } \ @@ -87,7 +87,7 @@ deque_pop_##name(struct deque_##name *q, type *w) \ if (sz > 0) return 0; \ \ ret = ps_cas((unsigned long *)&q->top, ct, ct + 1); \ - if (!ps_cas((unsigned long *)&q->bottom, cb, ct + 1)) assert(0); \ + if (!ps_upcas((unsigned long *)&q->bottom, cb, ct + 1)) assert(0); \ if (!ret) { *w = NULL; return -ENOENT; } \ \ return 0; \ diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index c3b5214856..88d88a1a74 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -325,6 +325,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), assert(pt && d); assert(arg_size + arg_align - 1 <= PART_MAX_DATA); + memset(d->data, 0, PART_MAX_DATA); arg = (char *) (((uintptr_t) d->data + arg_align - 1) & ~(uintptr_t) (arg_align - 1)); if (cpyfn) cpyfn(arg, data); diff --git a/src/platform/i386/qemu-kvm.sh b/src/platform/i386/qemu-kvm.sh index b416a0f107..2ec66f87b1 100755 --- a/src/platform/i386/qemu-kvm.sh +++ b/src/platform/i386/qemu-kvm.sh @@ -12,4 +12,4 @@ fi MODULES=$(sh $1 | awk '/^Writing image/ { print $3; }' | tr '\n' ' ') #qemu-system-i386 -m 768 -nographic -kernel kernel.img -no-reboot -s -initrd "$(echo $MODULES | tr ' ' ',')" -qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=2,threads=1 -cpu host -nographic -m 768 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" +qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=4,threads=1 -cpu host -nographic -m 768 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" From c91f6eb3f8f67432cb1f3bc91b691bc957a48fc3 Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 17 May 2019 00:41:16 -0400 Subject: [PATCH 081/127] fixed timeout based slowpath in sl --- src/components/include/sl.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 4176e843dc..fba0e3c171 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -65,7 
+65,7 @@ struct sl_global_core { int cyc_per_usec; cycles_t period; - cycles_t timer_next; + cycles_t timer_next, timer_prev; tcap_time_t timeout_next; struct cos_scb_info *scb_info; @@ -374,10 +374,11 @@ sl_timeout_period_get(void) static inline void sl_timeout_oneshot(cycles_t absolute_us) { - sl__globals_core()->timer_next = absolute_us; - sl__globals_core()->timeout_next = tcap_cyc2time(absolute_us); + struct sl_global_core *g = sl__globals_core(); - sl_scb_info_core()->timer_next = absolute_us; + g->timer_prev = g->timer_next; + g->timer_next = absolute_us; + g->timeout_next = tcap_cyc2time(absolute_us); } static inline void @@ -616,7 +617,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) * if the periodic timer is already ahead, * don't reprogram it! */ - if (likely(offset > globals->cyc_per_usec)) ret = sl_thd_dispatch(t, tok, sl_thd_curr()); + if (likely(offset > globals->cyc_per_usec && globals->timer_prev)) ret = sl_thd_dispatch(t, tok, sl_thd_curr()); else ret = sl_thd_activate(t, tok, globals->timeout_next); #ifdef SL_REPLENISH @@ -696,7 +697,7 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) * * else, reprogram for an earlier timeout requested. */ - if (likely(offset > globals->cyc_per_usec + if (likely(offset > globals->cyc_per_usec && globals->timer_prev && abs_timeout > globals->timer_next)) { ret = sl_thd_dispatch(t, tok, sl_thd_curr()); } else { From bc0ae03b4af799a5fefa4cc92371cf79f01182c9 Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 17 May 2019 00:41:46 -0400 Subject: [PATCH 082/127] Modified part to use blocking vs yielding for synchronization * using cross-core asnds for wakeup results in general-protection fault. TODO: debug and fix that. --- .../no_interface/omp_hello/hello_omp.c | 3 +- src/components/include/deque.h | 2 +- src/components/include/part.h | 140 ++++++++++-------- src/components/include/part_task.h | 123 ++++++++------- src/components/lib/cos_gomp/cos_gomp.c | 4 +- src/components/lib/part.c | 7 +- src/components/lib/sl/sl_mod_part_fifo.c | 7 +- src/kernel/include/shared/cos_config.h | 2 +- 8 files changed, 164 insertions(+), 124 deletions(-) diff --git a/src/components/implementation/no_interface/omp_hello/hello_omp.c b/src/components/implementation/no_interface/omp_hello/hello_omp.c index 5bfb7757ee..f96d49d3fc 100644 --- a/src/components/implementation/no_interface/omp_hello/hello_omp.c +++ b/src/components/implementation/no_interface/omp_hello/hello_omp.c @@ -55,12 +55,11 @@ int main ( void ) INSIDE THE PARALLEL REGION, have each thread say hello. 
*/ #if 1 -#pragma omp parallel num_threads(2) private(id) +#pragma omp parallel private(id) { #pragma omp for for (id = 0; id < 10; id++) { -#pragma omp task PRINTC("id:%u\n", id); } } diff --git a/src/components/include/deque.h b/src/components/include/deque.h index 6563039940..7f5a1fe164 100644 --- a/src/components/include/deque.h +++ b/src/components/include/deque.h @@ -19,7 +19,7 @@ * PPoPP implementation paper, "Correct and Efficient Work-Stealing for Weak Memory Models" * https://www.di.ens.fr/~zappa/readings/ppopp13.pdf */ -#define DEQUE_MAX_SZ (1<<15) +#define DEQUE_MAX_SZ (1<<13) #define DEQUE_PROTOTYPE(name, type) \ struct deque_##name { \ diff --git a/src/components/include/part.h b/src/components/include/part.h index 96579eb13f..9e60e4d1ea 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -138,20 +138,14 @@ part_pool_wakeup(void) if (!ps_load(&in_main_parallel)) return; sl_cs_enter(); - if (unlikely(ps_list_head_empty(part_thdpool_curr()))) { - sl_cs_exit(); - - /* there is nothing in the pool, should we do load-balance? */ - //sl_xcore_load_balance(); - return; - } + if (unlikely(ps_list_head_empty(part_thdpool_curr()))) goto done; t = ps_list_head_first(part_thdpool_curr(), struct sl_thd, partlist); assert(t != sl_thd_curr()); ps_list_rem(t, partlist); - sl_cs_exit(); - - sl_thd_wakeup(sl_thd_thdid(t)); + sl_thd_wakeup_no_cs(t); +done: + sl_cs_exit_schedule(); #endif } @@ -161,13 +155,12 @@ part_pool_block(void) #ifdef PART_ENABLE_BLOCKING struct sl_thd *t = sl_thd_curr(); - assert(ps_list_singleton(t, partlist)); sl_cs_enter(); + if (ps_list_singleton(t, partlist)) ps_list_head_append(part_thdpool_curr(), t, partlist); + if (!sl_thd_is_runnable(t)) assert(0); - ps_list_head_append(part_thdpool_curr(), t, partlist); - sl_cs_exit(); - - sl_thd_block(0); + sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0); + sl_cs_exit_schedule(); #else sl_thd_yield(0); #endif @@ -300,7 +293,7 @@ part_list_peek(void) i = part_task_work_try(&main_task); assert(i != 0); - if (likely(i > 0 && !ps_load(&main_task.end))) return &main_task; + if (likely(i > 0 && ps_load(&main_task.end) != main_task.nthds)) return &main_task; return NULL; #endif @@ -310,47 +303,83 @@ void part_init(void); unsigned part_isready(void); -/* a part_task.h api but uses part_list_remove in the master thread, so here! */ static inline void -part_task_end(struct part_task *t) +part_task_barrier(struct part_task *t, int is_end) { struct sl_thd *ts = sl_thd_curr(); - struct part_task *p = t->parent; - int tn = part_task_work_thd_num(t); + unsigned cbc = 0, cbep = 0; + unsigned ec = 0; + int is_master = t->master == PART_CURR_THD ? 1 : 0; assert(t->type != PART_TASK_T_NONE); - assert(tn >= 0); + assert(t->state == PART_TASK_S_INITIALIZED); assert(t->nthds >= 1); - assert(ts->part_context == (void *)t); - if (t->nthds == 1) { - int i; - assert(tn == 0); + /* master thread to wait for child threads first, before barrier! */ + if (is_master) { + assert(t->master == PART_CURR_THD); part_task_wait_children(t); - part_task_remove_child(t->parent, t); + } + + if (t->nthds == 1) { + struct part_data *d; + + if (unlikely(!is_end)) return; + ps_faa(&t->end, 1); + /* remove myself from my parent. 
*/ + part_task_remove_child(t); if (t->type == PART_TASK_T_WORKSHARE) { - assert(t->workers[tn] == t->master); - ts->part_context = p; + assert(is_master); + ts->part_context = t->parent; + + return; } + ts->part_context = NULL; + d = t->data_env; + + part_task_free(t); + part_data_free(d); + return; } - part_task_barrier(t); - if (tn == 0) { - if (t->type == PART_TASK_T_WORKSHARE) part_list_remove(t); - ts->part_context = p; - part_task_remove_child(t->parent, t); - ps_faa(&t->end, 1); + assert(t->type == PART_TASK_T_WORKSHARE); + + cbep = ps_load(&t->barrier_epoch); + cbc = ps_faa(&t->barrier, -1); + if (cbc > 1) { + sl_thd_block(0); } else { - ps_faa(&t->end, 1); - while (ps_load(&t->end) != t->nthds) sl_thd_yield(0); + if (ps_cas(&t->barrier, 0, t->nthds)) ps_faa(&t->barrier_epoch, 1); + if (is_master) { + part_peer_wakeup(t); + } else { + part_master_wakeup(t); + sl_thd_block(0); + } + } + assert(ps_load(&t->barrier_epoch) == cbep + 1); + if (!is_end) return; + ec = ps_faa(&t->end, 1); + + if (is_master) { + while (ps_load(&t->end) != t->nthds) sl_thd_block(0); + part_task_remove_child(t); + part_list_remove(t); + ts->part_context = t->parent; + } else { + part_master_wakeup(t); ts->part_context = NULL; } } +static inline void +part_task_end(struct part_task *t) +{ part_task_barrier(t, 1); } + static inline void part_thd_fn(void *d) { @@ -359,27 +388,26 @@ part_thd_fn(void *d) /* parallel runtime not ready? */ /* if (unlikely(!part_isready())) part_pool_block(); */ /* not in the main parallel block? */ - while (!ps_load(&in_main_parallel)) part_pool_block(); while (1) { struct part_task *t = NULL; int ret; - int thdnum = -1; - unsigned thd = cos_cpuid() << 16 | cos_thdid(); + + while (!ps_load(&in_main_parallel)) part_pool_block(); /* FIXME: nested parallel needs love! */ t = part_list_peek(); - if (likely(t)) { - thdnum = part_task_work_try(t); - if (thdnum >= 0) goto found; - } + if (likely(t)) goto found; single: ret = part_deque_pop(&t); if (likely(ret == 0)) { + int thdnum = -1; + assert(t && t->type == PART_TASK_T_TASK); thdnum = part_task_work_try(t); - if (thdnum == 0) goto found; + assert(thdnum == 0); + goto found; } if (unlikely(ret == -EAGAIN)) goto single; @@ -389,27 +417,23 @@ part_thd_fn(void *d) part_pool_block(); continue; + } else { + int thdnum = -1; + + assert(t->type == PART_TASK_T_TASK); + thdnum = part_task_work_try(t); + if (thdnum < 0) continue; + assert(thdnum == 0); } - assert(t->type == PART_TASK_T_TASK); + found: - assert(t->type != PART_TASK_T_NONE); - if (unlikely(thdnum < 0)) thdnum = part_task_work_try(t); - if (unlikely(thdnum < 0)) continue; - if (t->type == PART_TASK_T_TASK) assert(thdnum == 0); + assert(t); curr->part_context = (void *)t; t->cs.fn(t->cs.data); part_task_end(t); - /* free the explicit task! 
*/ - if (t->type == PART_TASK_T_TASK) { - struct part_data *d = t->data_env; - - assert(t->nthds == 1 && t->end == 1); - part_task_free(t); - part_data_free(d); - } - curr->part_context = NULL; + assert(curr->part_context == NULL); } sl_thd_exit(); diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index 83b1b3f7bc..6d203adaa0 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -6,15 +6,17 @@ #include #include -#define PART_THD(c, t) (cos_cpuid() << 16 | cos_thdid()) +#define PART_THD(c, t) ((unsigned)(cos_cpuid() << 16 | cos_thdid())) #define PART_CURR_THD PART_THD(cos_cpuid(), cos_thdid()) +#define PART_THD_COREID(t) (t >> 16) +#define PART_THD_THDID(t) ((t << 16) >> 16) -#define PART_MAX_TASKS 256 -#define PART_MAX_DATA 128 -#define PART_MAX_PAR_THDS NUM_CPU +#define PART_MAX_TASKS 256 +#define PART_MAX_DATA 256 +#define PART_MAX_PAR_THDS NUM_CPU #define PART_MAX_THDS 128 #define PART_MAX_CORE_THDS (PART_MAX_THDS/NUM_CPU) -#define PART_MAX_CHILD 16 +#define PART_MAX_CHILD 16 #define PART_MAX_WORKSHARES 16 typedef void (*part_fn_t)(void *); @@ -77,10 +79,11 @@ struct part_task { struct part_closure cs; unsigned nthds; /* number of threads for this task, 1 in case of non-workshare work */ + unsigned nworkers; unsigned workers[PART_MAX_PAR_THDS]; /* threads sharing this work or thread doing this work! */ int ws_off[PART_MAX_PAR_THDS]; /* progress of the workshares in each participating thread */ unsigned master; /* coreid << 16 | thdid of the master */ - unsigned barrier_in, barrier_out, end; + unsigned end, barrier, barrier_epoch; struct part_data *data_env; struct part_task *parent; @@ -103,12 +106,17 @@ part_task_init(struct part_task *t, part_task_type_t type, struct part_task *p, t->cs.fn = fn; t->cs.data = data; t->nthds = nthds; + t->nworkers = 0; memset(t->workers, 0, sizeof(unsigned) * PART_MAX_PAR_THDS); t->master = PART_CURR_THD; /* if it's worksharing, current thread is the master and does take part in the par section */ - if (type == PART_TASK_T_WORKSHARE) t->workers[0] = t->master; + if (type == PART_TASK_T_WORKSHARE) { + t->nworkers = 1; + t->workers[0] = t->master; + } for (i = 0; i < PART_MAX_PAR_THDS; i++) t->ws_off[i] = -1; - t->barrier_in = t->barrier_out = t->end = 0; + t->barrier = t->nthds; + t->end = t->barrier_epoch = 0; t->data_env = d; t->parent = p; t->nchildren = 0; @@ -137,22 +145,46 @@ part_task_add_child(struct part_task *t, struct part_task *c) } static inline void -part_task_remove_child(struct part_task *t, struct part_task *c) +part_thd_wakeup(unsigned thd) { + thdid_t t = PART_THD_THDID(thd); + cpuid_t c = PART_THD_COREID(thd); + + assert(c >= 0 && c < NUM_CPU); + assert(t < MAX_NUM_THREADS); + + if (thd == PART_CURR_THD) return; + if (c != cos_cpuid()) sl_xcore_thd_wakeup_tid(t, c); + else sl_thd_wakeup(t); +} + +static inline void +part_task_remove_child(struct part_task *c) +{ + struct part_task *p = c->parent; + unsigned wkup; int i; - if (unlikely(!t || !c)) return; - assert(t->state == PART_TASK_S_INITIALIZED); + if (unlikely(!p)) return; + assert(c->state == PART_TASK_S_INITIALIZED); + + if (c->type == PART_TASK_T_TASK) wkup = c->master; + else wkup = p->master; - i = ps_faa(&t->nchildren, -1); + i = ps_faa(&p->nchildren, -1); assert(i > 0); + + part_thd_wakeup(wkup); } static inline void part_task_wait_children(struct part_task *t) { assert(t->state == PART_TASK_S_INITIALIZED); - while (ps_load(&t->nchildren) > 0) sl_thd_yield(0); + if (t->type == PART_TASK_T_WORKSHARE) 
assert(t->master == PART_CURR_THD); + else if (t->type == PART_TASK_T_TASK) assert(t->workers[0] == PART_CURR_THD); + + while (ps_load(&t->nchildren) > 0) sl_thd_block(0); assert(t->nchildren == 0); } @@ -160,7 +192,7 @@ part_task_wait_children(struct part_task *t) static inline int part_task_work_try(struct part_task *t) { - unsigned i; + int i = 0; unsigned key = PART_CURR_THD; assert(t->state == PART_TASK_S_INITIALIZED); @@ -172,17 +204,15 @@ part_task_work_try(struct part_task *t) assert(t->nthds >= 1); } - for (i = 0; i < t->nthds; i++) - { - unsigned w = ps_load(&t->workers[i]); - - if (w == key) return i; - if (w) continue; + /* task was finished! */ + if (unlikely(ps_load(&t->end) == t->nthds)) return -1; + /* if you can work with this task */ + i = ps_faa(&t->nworkers, 1); + if (unlikely(i >= (int)t->nthds)) return -1; - if (likely(ps_cas(&t->workers[i], w, key))) return i; - } + t->workers[i] = key; - return -1; + return i; } static inline int @@ -191,8 +221,10 @@ part_task_work_thd_num(struct part_task *t) int i; unsigned key = PART_CURR_THD; + assert(t); + assert(t->state == PART_TASK_S_INITIALIZED); - if (t->type == PART_TASK_T_TASK) { + if (likely(t->type == PART_TASK_T_TASK)) { assert(t->nthds == 1); if (ps_load(&t->workers[0]) == key) return 0; @@ -210,40 +242,27 @@ part_task_work_thd_num(struct part_task *t) } static inline void -part_task_barrier(struct part_task *t) +part_master_wakeup(struct part_task *t) { - int tn = part_task_work_thd_num(t); - unsigned cin = 0, cout = 0; - + assert(t->type == PART_TASK_T_WORKSHARE); assert(t->state == PART_TASK_S_INITIALIZED); - assert(tn >= 0 && t->nthds >= 1); - - if (t->nthds == 1) { - int i; - - assert(tn == 0 && t->barrier_in == 0); - - /* wait for all child tasks to complete, including explicit tasks */ - part_task_wait_children(t); - - return; - } + assert(t->nthds > 1); + assert(t->master && t->master != PART_CURR_THD); - /* wait for all siblings to have seen the previous barrier */ - while (ps_load(&t->barrier_out) % t->nthds) sl_thd_yield(0); + part_thd_wakeup(t->master); +} - cin = ps_faa(&t->barrier_in, 1); - if (cin % t->nthds == t->nthds - 1) { - int i; +static inline void +part_peer_wakeup(struct part_task *t) +{ + unsigned i; - /* wait for all child tasks to complete, including explicit tasks */ - part_task_wait_children(t); - } else { - /* wait for all sibling tasks to reach in barrier! 
*/ - while (ps_load(&t->barrier_in) % t->nthds != 0) sl_thd_yield(0); - } + assert(t->type == PART_TASK_T_WORKSHARE); + assert(t->state == PART_TASK_S_INITIALIZED); + assert(t->nthds > 1); + assert(t->master == PART_CURR_THD); - ps_faa(&t->barrier_out, 1); + for (i = 1; i < t->nthds; i++) part_thd_wakeup(t->workers[i]); } #endif /* PART_TASK_H */ diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index 88d88a1a74..1b1590d0c8 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -151,7 +151,7 @@ GOMP_barrier (void) { struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; - part_task_barrier(t); + part_task_barrier(t, 0); } static inline bool @@ -276,7 +276,7 @@ GOMP_loop_end (void) assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); - part_task_barrier(t); + part_task_barrier(t, 0); } void diff --git a/src/components/lib/part.c b/src/components/lib/part.c index e29e7d9ab0..d317ae75d7 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -25,7 +25,7 @@ static struct part_task *part_tasks = NULL; static struct part_data *part__data = NULL; struct ps_list_head part_thdpool_core[NUM_CPU]; -#define PART_DEQUE_SZ 64 +#define PART_DEQUE_SZ PART_MAX_TASKS #define _PART_PRIO TCAP_PRIO_MAX #define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) @@ -36,7 +36,10 @@ struct ps_list_head part_thdpool_core[NUM_CPU]; static void part_idle_fn(void *d) { - while (1) part_pool_wakeup(); + while (1) { + part_pool_wakeup(); + sl_thd_yield_thd(sl__globals_core()->sched_thd); + } } struct part_data * diff --git a/src/components/lib/sl/sl_mod_part_fifo.c b/src/components/lib/sl/sl_mod_part_fifo.c index 53eac3dd13..0c1e727284 100644 --- a/src/components/lib/sl/sl_mod_part_fifo.c +++ b/src/components/lib/sl/sl_mod_part_fifo.c @@ -30,12 +30,7 @@ sl_mod_schedule(void) if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; t = ps_list_head_first_d(&threads[cos_cpuid()], struct sl_thd_policy); - /* - * we're the only thread and we're yielding, that - * means, we don't want to run anymore. run idle thread so it can - * pick someone else and that can do some work! 
- */ - if (likely(c != t)) return t; + return t; done: if (likely(idle_thd[cos_cpuid()])) return idle_thd[cos_cpuid()]; diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index 8477e914dd..8c46ae5377 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -17,7 +17,7 @@ #include "cpu_ghz.h" -#define NUM_CPU 4 +#define NUM_CPU 2 #define NUM_CPU_BMP_BYTES ((NUM_CPU + 7) / 8) #define NUM_CPU_BMP_WORDS ((NUM_CPU_BMP_BYTES + 3) / 4) From 25b108c5125a3a58cccc41c0291a2f58272fafa4 Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 17 May 2019 13:29:00 -0400 Subject: [PATCH 083/127] merge error fix: was causing general protection fault in ipi snd --- src/platform/i386/idt.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/platform/i386/idt.c b/src/platform/i386/idt.c index 018f183aea..821806bfee 100644 --- a/src/platform/i386/idt.c +++ b/src/platform/i386/idt.c @@ -68,6 +68,17 @@ idt_init(const cpuid_t cpu_id) idt_ptr.base = (u32_t)&(idt_entries); memset(&(idt_entries), 0, sizeof(struct idt_entry) * NUM_IDT_ENTRIES); + outb(0x20, 0x11); + outb(0xA0, 0x11); + outb(0x21, 0x20); + outb(0xA1, 0x28); + outb(0x21, 0x04); + outb(0xA1, 0x02); + outb(0x21, 0x01); + outb(0xA1, 0x01); + outb(0x21, 0x0); + outb(0xA1, 0x0); + idt_set_gate(IRQ_DIV_BY_ZERO_ERR_FAULT, (u32_t)div_by_zero_err_fault_irq, 0x08, 0x8E); idt_set_gate(IRQ_DEBUG_TRAP, (u32_t)debug_trap_irq, 0x08, 0x8E); idt_set_gate(IRQ_BREAKPOINT_TRAP, (u32_t)breakpoint_trap_irq, 0x08, 0x8E); @@ -120,6 +131,7 @@ idt_init(const cpuid_t cpu_id) idt_set_gate(HW_ID30, (u32_t)handler_hw_61, 0x08, 0x8E); idt_set_gate(HW_ID31, (u32_t)handler_hw_62, 0x08, 0x8E); idt_set_gate(HW_LAPIC_SPURIOUS, (u32_t)lapic_spurious_irq, 0x08, 0x8E); + idt_set_gate(HW_LAPIC_IPI_ASND, (u32_t)lapic_ipi_asnd_irq, 0x08, 0x8E); idt_set_gate(HW_LAPIC_TIMER, (u32_t)lapic_timer_irq, 0x08, 0x8E); update: From d6a64fb408c1d7108f582ead3cbcfcd2413a73d6 Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 17 May 2019 23:18:38 -0400 Subject: [PATCH 084/127] consistency fix in lazy restore for preemption THIS DOES NOT SOLVE THE PROBLEM WITH IPIs I'VE RIGHT NOW --- src/components/include/sl.h | 117 ++++++++++++++++++++++--------- src/components/lib/sl/sl_sched.c | 2 +- src/components/lib/sl/sl_xcore.c | 13 ++-- src/kernel/capinv.c | 38 ++++++---- src/kernel/include/thd.h | 12 ++-- 5 files changed, 120 insertions(+), 62 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index fba0e3c171..93f5813327 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -435,21 +435,51 @@ sl_thd_is_runnable(struct sl_thd *t) int sl_thd_kern_dispatch(thdcap_t t); +static inline int +sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout) +{ + struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *ci = &dci->ci; + struct sl_global_core *g = sl__globals_core(); + int ret = 0; + + if (t->properties & SL_THD_PROPERTY_SEND) { + return cos_sched_asnd(t->sndcap, timeout, g->sched_rcv, tok); + } else if (t->properties & SL_THD_PROPERTY_OWN_TCAP) { + return cos_switch(sl_thd_thdcap(t), sl_thd_tcap(t), t->prio, + timeout, g->sched_rcv, tok); + } else { + ret = cos_defswitch(sl_thd_thdcap(t), t->prio, t == g->sched_thd ? 
+ TCAP_TIME_NIL : timeout, tok); + if (likely(t != g->sched_thd && t != g->idle_thd)) return ret; + if (unlikely(ret != -EPERM)) return ret; + + /* + * Attempting to activate scheduler thread or idle thread failed for no budget in it's tcap. + * Force switch to the scheduler with current tcap. + */ + return cos_switch(g->sched_thdcap, g->sched_tcap, t->prio, + timeout, g->sched_rcv, tok); + } +} + + + static inline int sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { struct cos_scb_info *scb = sl_scb_info_core(); struct cos_dcb_info *cd = sl_thd_dcbinfo(curr), *nd = sl_thd_dcbinfo(next); - if (unlikely(!cd || !nd)) { - return sl_thd_kern_dispatch(sl_thd_thdcap(next)); - } + assert(curr != next); + if (unlikely(!cd || !nd)) return sl_thd_activate(next, tok, sl__globals_core()->timeout_next); /* * jump labels in the asm routine: * * 1: slowpath dispatch using cos_thd_switch to switch to a thread * if the dcb sp of the next thread is reset. + * (inlined slowpath sysenter to debug preemption problem) * * 2: if user-level dispatch routine completed successfully so * the register states still retained and in the dispatched thread @@ -461,6 +491,53 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) * NOTE: kernel takes care of resetting dcb sp in this case! */ + __asm__ __volatile__ ( \ + "pushl %%eax\n\t" \ + "pushl %%ebx\n\t" \ + "pushl %%ecx\n\t" \ + "pushl %%edx\n\t" \ + "pushl %%esi\n\t" \ + "pushl %%edi\n\t" \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $2f, (%%eax)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "cmp $0, 4(%%ebx)\n\t" \ + "je 1f\n\t" \ + "movl %%edx, (%%ecx)\n\t" \ + "movl 4(%%ebx), %%esp\n\t" \ + "jmp *(%%ebx)\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $3f, %%ecx\n\t" \ + "movl %%edx, %%eax\n\t" \ + "inc %%eax\n\t" \ + "shl $16, %%eax\n\t" \ + "movl $0, %%ebx\n\t" \ + "movl $0, %%esi\n\t" \ + "movl $0, %%edi\n\t" \ + "movl $0, %%edx\n\t" \ + "sysenter\n\t" \ + "jmp 3f\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "movl $0, 4(%%ebx)\n\t" \ + ".align 4\n\t" \ + "3:\n\t" \ + "popl %%ebp\n\t" \ + "popl %%edi\n\t" \ + "popl %%esi\n\t" \ + "popl %%edx\n\t" \ + "popl %%ecx\n\t" \ + "popl %%ebx\n\t" \ + "popl %%eax\n\t" \ + : + : "a" (cd), "b" (nd), + "S" ((u32_t)((u64_t)tok >> 32)), "D" ((u32_t)(((u64_t)tok << 32) >> 32)), + "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) + : "memory", "cc"); + +#if 0 __asm__ __volatile__ ( \ "pushl %%ebp\n\t" \ "movl $2f, (%%eax)\n\t" \ @@ -487,38 +564,12 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "S" ((u32_t)((u64_t)tok >> 32)), "D" ((u32_t)(((u64_t)tok << 32) >> 32)), "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); +#endif - if (likely(sl_scb_info_core()->sched_tok == tok)) return 0; - - return -EAGAIN; -} - -static inline int -sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout) -{ - struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); - struct cos_compinfo *ci = &dci->ci; - struct sl_global_core *g = sl__globals_core(); - int ret = 0; - - if (t->properties & SL_THD_PROPERTY_SEND) { - return cos_sched_asnd(t->sndcap, timeout, g->sched_rcv, tok); - } else if (t->properties & SL_THD_PROPERTY_OWN_TCAP) { - return cos_switch(sl_thd_thdcap(t), sl_thd_tcap(t), t->prio, - timeout, g->sched_rcv, tok); - } else { - ret = cos_defswitch(sl_thd_thdcap(t), t->prio, t == g->sched_thd ? 
- TCAP_TIME_NIL : timeout, tok); - if (likely(t != g->sched_thd && t != g->idle_thd)) return ret; - if (unlikely(ret != -EPERM)) return ret; + //if (likely(sl_scb_info_core()->sched_tok == tok)) return 0; - /* - * Attempting to activate scheduler thread or idle thread failed for no budget in it's tcap. - * Force switch to the scheduler with current tcap. - */ - return cos_switch(g->sched_thdcap, g->sched_tcap, t->prio, - timeout, g->sched_rcv, tok); - } + return 0; + //return -EAGAIN; } static inline int diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 80ecc22297..0dd888ea07 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -681,7 +681,7 @@ sl_sched_loop_intern(int non_block) */ pending = cos_ul_sched_rcv(g->sched_rcv, rfl, g->timeout_next, &e); - if (!e.tid) goto pending_events; + if (pending < 0 || !e.tid) goto pending_events; t = sl_thd_lkup(e.tid); assert(t); diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c index 665c4be9af..cb41b76be2 100644 --- a/src/components/lib/sl/sl_xcore.c +++ b/src/components/lib/sl/sl_xcore.c @@ -78,11 +78,18 @@ static inline int _sl_xcore_request_enqueue_no_cs(cpuid_t core, struct sl_xcore_request *rq) { int ret = 0; +// asndcap_t snd = 0; if (unlikely(core >= NUM_CPU)) return -1; if (unlikely(core == cos_cpuid())) return -1; if (unlikely(!bitmap_check(sl__globals()->core_bmp, core))) return -1; ret = ck_ring_enqueue_mpsc_xcore(sl__ring(core), sl__ring_buffer(core), rq); +// snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; +// assert(snd); +// +// /* send an IPI for the request */ +// cos_asnd(snd, 0); + if (unlikely(ret == false)) return -1; return 0; @@ -92,7 +99,6 @@ static inline int _sl_xcore_request_enqueue(cpuid_t core, struct sl_xcore_request *rq) { int ret = 0; - /* asndcap_t snd = 0; */ if (unlikely(core >= NUM_CPU)) return -1; sl_cs_enter(); @@ -100,11 +106,6 @@ _sl_xcore_request_enqueue(cpuid_t core, struct sl_xcore_request *rq) sl_cs_exit(); if (unlikely(ret)) return -1; - /* snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; */ - /* assert(snd); */ - - /* send an IPI for the request */ - /* if (snd) cos_asnd(snd, 0); */ return 0; } diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index cb482de624..2271153b4e 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -89,10 +89,10 @@ static inline struct thread * cap_ulthd_lazyupdate(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, int interrupt, struct comp_info **ci_ptr) { struct thread *thd = thd_current(cos_info); - struct cap_thd *ch_ult; - struct thread *ulthd; - capid_t ultc; - int invstk_top; + struct cap_thd *ch_ult = NULL; + struct thread *ulthd = NULL; + capid_t ultc = 0; + int invstk_top = 0; struct cos_scb_info *scb_core = NULL; /* per-core scb_info */ *ci_ptr = thd_invstk_current_compinfo(thd, cos_info, &invstk_top); @@ -101,26 +101,36 @@ cap_ulthd_lazyupdate(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, if (unlikely(!(*ci_ptr)->scb_data)) goto done; scb_core = (((*ci_ptr)->scb_data) + get_cpuid()); + ultc = scb_core->curr_thd; + /* reset inconsistency from user-level thd! 
*/ + scb_core->curr_thd = 0; + if (!ultc && !interrupt) goto done; + + if (likely(ultc)) { + ch_ult = (struct cap_thd *)captbl_lkup((*ci_ptr)->captbl, ultc); + if (unlikely(!CAP_TYPECHK_CORE(ch_ult, CAP_THD))) ch_ult = NULL; + else ulthd = ch_ult->t; + } if (unlikely(interrupt)) { + struct thread *fixthd = thd; + assert(scb_core->sched_tok < ~0U); cos_faa((int *)&(scb_core->sched_tok), 1); - } - ultc = scb_core->curr_thd; - if (!ultc) goto done; - ch_ult = (struct cap_thd *)captbl_lkup((*ci_ptr)->captbl, ultc); - if (unlikely(!CAP_TYPECHK_CORE(ch_ult, CAP_THD))) goto done; + if (ulthd) fixthd = ulthd; - /* reset inconsistency from user-level thd! */ - scb_core->curr_thd = 0; + if (unlikely(fixthd->dcbinfo && fixthd->dcbinfo->sp)) { + regs->ip = fixthd->dcbinfo->ip + DCB_IP_KERN_OFF; + regs->sp = fixthd->dcbinfo->sp; - ulthd = ch_ult->t; - if (unlikely(ulthd->dcbinfo == NULL)) goto done; + fixthd->dcbinfo->sp = 0; + } + } + if (unlikely(!ultc || !ulthd || ulthd->dcbinfo == NULL)) goto done; if (ulthd == thd) goto done; /* check if kcurr and ucurr threads are both in the same page-table(component) */ if (thd_current_pgtbl(ulthd) != thd_current_pgtbl(thd)) goto done; - thd_current_update(ulthd, thd, cos_info); thd = ulthd; diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 1705fcd78f..00cdb65e65 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -669,14 +669,10 @@ thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) */ } - if (likely(thd->dcbinfo && thd->dcbinfo->sp)) { - if (!preempt) { - regs->dx = regs->ip = thd->dcbinfo->ip + DCB_IP_KERN_OFF; - regs->cx = regs->sp = thd->dcbinfo->sp; - } else { - regs->ip = thd->dcbinfo->ip + DCB_IP_KERN_OFF; - regs->sp = thd->dcbinfo->sp; - } + if (unlikely(thd->dcbinfo && thd->dcbinfo->sp)) { + assert(preempt == 0); + regs->dx = regs->ip = thd->dcbinfo->ip + DCB_IP_KERN_OFF; + regs->cx = regs->sp = thd->dcbinfo->sp; thd->dcbinfo->sp = 0; } From 93c52031be6243d7b61025f8a1f13892ea878834 Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 17 May 2019 23:20:40 -0400 Subject: [PATCH 085/127] Fixed explicit task allocation to use per-core list * ps_slab usage has problems, i couldn't get it to work correctly. will get to that eventually. * STILL NO CLUE ON THE IPI PREEMPTION PROBLEM, just pushing the current working code without IPIs. 
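The per-core pools added here are Treiber stacks threaded through the
next_free fields. A minimal, self-contained sketch of the push/pop protocol
they rely on (illustrative only: the free_stack/node names and the GCC
__atomic builtins are stand-ins for this sketch, while the actual code uses
ps_load()/ps_cas() over struct part_task and struct part_data and shares the
usual ABA caveat of pointer-only CAS):

    struct node       { struct node *next_free; };
    struct free_stack { struct node *head; };

    /* push a freed object; retry when a concurrent push/pop wins the CAS */
    static void
    free_stack_push(struct free_stack *s, struct node *n)
    {
            struct node *old;

            do {
                    old          = __atomic_load_n(&s->head, __ATOMIC_RELAXED);
                    n->next_free = old;
            } while (!__atomic_compare_exchange_n(&s->head, &old, n, 0,
                                                  __ATOMIC_RELEASE, __ATOMIC_RELAXED));
    }

    /* pop one object, or NULL if this core's pool is empty */
    static struct node *
    free_stack_pop(struct free_stack *s)
    {
            struct node *old;

            do {
                    old = __atomic_load_n(&s->head, __ATOMIC_ACQUIRE);
                    if (!old) return NULL;
            } while (!__atomic_compare_exchange_n(&s->head, &old, old->next_free, 0,
                                                  __ATOMIC_ACQUIRE, __ATOMIC_RELAXED));
            old->next_free = NULL;

            return old;
    }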
--- src/components/Makefile.comp | 1 + .../implementation/Makefile.subsubdir | 2 +- .../no_interface/omp_dijkstra/Makefile | 2 +- .../no_interface/omp_fib_bots/Makefile | 2 +- .../no_interface/omp_hello/Makefile | 2 +- .../no_interface/omp_sort_bots/Makefile | 2 +- .../tests/micro_booter/micro_booter.c | 3 +- .../tests/unit_schedtests/unit_schedlib.c | 4 +- src/components/include/part.h | 7 +- src/components/include/part_task.h | 2 + src/components/lib/part.c | 296 ++++++++++++++---- 11 files changed, 254 insertions(+), 69 deletions(-) diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 9ef1339e3f..2a4887534e 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -11,6 +11,7 @@ MUSLBIN=$(MUSLDIR)/bin MUSLCC=$(MUSLBIN)/musl-$(CC) MUSLINC=-isystem$(MUSLDIR)/include +PSLIBDIR=$(LIBDIR)/ps CKDIR=$(LIBDIR)/ck CKLIBDIR=$(CKDIR)/lib CKINCDIR=$(CKDIR)/include diff --git a/src/components/implementation/Makefile.subsubdir b/src/components/implementation/Makefile.subsubdir index 693d3a11a1..89bc44b379 100644 --- a/src/components/implementation/Makefile.subsubdir +++ b/src/components/implementation/Makefile.subsubdir @@ -42,7 +42,7 @@ TMP_STR2=tmp2 INCLUDE=-I../ $(DEP_INC) $(IF_INCLUDE) $(CINC) LIB_LIBRARIES_PRE=$(DEP_LIB_EXIST) LIB_LIBRARIES=$(strip $(LIB_LIBRARIES_PRE)) -LIB_FLAGS=-L$(CKLIBDIR) -L$(LIBDIR) -L$(LIBCXXDIR) $(DEP_LIB) $(LIB_LIBRARIES) $(ADDITIONAL_LIBS) +LIB_FLAGS=-L$(PSLIBDIR) -L$(CKLIBDIR) -L$(LIBDIR) -L$(LIBCXXDIR) $(DEP_LIB) $(LIB_LIBRARIES) $(ADDITIONAL_LIBS) C_SOURCES=$(C_OBJS:%.o=%.c) CXX_SOURCES=$(CXX_OBJS:%.o=%.cc) diff --git a/src/components/implementation/no_interface/omp_dijkstra/Makefile b/src/components/implementation/no_interface/omp_dijkstra/Makefile index 2724553d78..c81e74faf6 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/Makefile +++ b/src/components/implementation/no_interface/omp_dijkstra/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_dijkstra.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt -lps include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_fib_bots/Makefile b/src/components/implementation/no_interface/omp_fib_bots/Makefile index bdd8a43b44..7eea727205 100644 --- a/src/components/implementation/no_interface/omp_fib_bots/Makefile +++ b/src/components/implementation/no_interface/omp_fib_bots/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_fib_bots.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt -lps include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_hello/Makefile b/src/components/implementation/no_interface/omp_hello/Makefile index f15a5fd6dd..185540bca8 100644 --- a/src/components/implementation/no_interface/omp_hello/Makefile +++ b/src/components/implementation/no_interface/omp_hello/Makefile @@ -2,7 +2,7 @@ 
COMPONENT=omp_hello.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt -lps include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_sort_bots/Makefile b/src/components/implementation/no_interface/omp_sort_bots/Makefile index a711420191..5d1d63e2cf 100644 --- a/src/components/implementation/no_interface/omp_sort_bots/Makefile +++ b/src/components/implementation/no_interface/omp_sort_bots/Makefile @@ -2,7 +2,7 @@ COMPONENT=omp_sort_bots.o INTERFACES= DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt -lps include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/micro_booter/micro_booter.c b/src/components/implementation/tests/micro_booter/micro_booter.c index e9346c1b58..04316ddda9 100644 --- a/src/components/implementation/tests/micro_booter/micro_booter.c +++ b/src/components/implementation/tests/micro_booter/micro_booter.c @@ -42,7 +42,8 @@ cos_init(void) assert(termthd[cos_cpuid()]); PRINTC("Micro Booter started.\n"); - test_run_mb(); + //test_run_mb(); + test_ipi_full(); /* NOTE: This is just to make sense of the output on HW! To understand that microbooter runs to completion on all cores! */ test_done[cos_cpuid()] = 1; diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index d917541c88..c7cd84a532 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -223,8 +223,8 @@ cos_init(void) cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); - test_yield_perf(); - //test_yields(); + //test_yield_perf(); + test_yields(); //test_blocking_directed_yield(); //test_timeout_wakeup(); diff --git a/src/components/include/part.h b/src/components/include/part.h index 9e60e4d1ea..53e60db085 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -141,11 +141,11 @@ part_pool_wakeup(void) if (unlikely(ps_list_head_empty(part_thdpool_curr()))) goto done; t = ps_list_head_first(part_thdpool_curr(), struct sl_thd, partlist); - assert(t != sl_thd_curr()); ps_list_rem(t, partlist); + if (t == sl_thd_curr()) goto done; sl_thd_wakeup_no_cs(t); done: - sl_cs_exit_schedule(); + sl_cs_exit(); #endif } @@ -308,7 +308,6 @@ part_task_barrier(struct part_task *t, int is_end) { struct sl_thd *ts = sl_thd_curr(); unsigned cbc = 0, cbep = 0; - unsigned ec = 0; int is_master = t->master == PART_CURR_THD ? 
1 : 0; assert(t->type != PART_TASK_T_NONE); @@ -363,7 +362,7 @@ part_task_barrier(struct part_task *t, int is_end) assert(ps_load(&t->barrier_epoch) == cbep + 1); if (!is_end) return; - ec = ps_faa(&t->end, 1); + ps_faa(&t->end, 1); if (is_master) { while (ps_load(&t->end) != t->nthds) sl_thd_block(0); diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index 6d203adaa0..ceb757993c 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -67,6 +67,7 @@ struct part_closure { struct part_data { int flag; /* 0 = not in use, 1 = in use */ + struct part_data *next_free; /* for explicit data allocation/free */ char data[PART_MAX_DATA]; }; @@ -90,6 +91,7 @@ struct part_task { int nchildren; struct ps_list partask; + struct part_task *next_free; /* for explicit task allocation/free */ } CACHE_ALIGNED; static inline void diff --git a/src/components/lib/part.c b/src/components/lib/part.c index d317ae75d7..fce8c5879f 100644 --- a/src/components/lib/part.c +++ b/src/components/lib/part.c @@ -3,14 +3,12 @@ #include #include #include <../interface/capmgr/memmgr.h> +#include +#include #include #include -#define PART_MAX_PAGES (((PART_MAX_TASKS * sizeof(struct part_task)) / PAGE_SIZE) + 1) -#define PART_MAX_DATA_PAGES (((PART_MAX_TASKS * sizeof(struct part_data)) / PAGE_SIZE) + 1) -#define PART_DEQUE_MAX_PAGES ((sizeof(struct deque_part) / PAGE_SIZE) + 1) - struct deque_part *part_dq_percore[NUM_CPU]; //struct cirque_par parcq_global; static volatile unsigned part_ready = 0; @@ -21,8 +19,8 @@ struct ps_list_head part_l_global; #else struct part_task main_task; #endif -static struct part_task *part_tasks = NULL; -static struct part_data *part__data = NULL; +//static struct part_task *part_tasks = NULL; +//static struct part_data *part__data = NULL; struct ps_list_head part_thdpool_core[NUM_CPU]; #define PART_DEQUE_SZ PART_MAX_TASKS @@ -32,79 +30,256 @@ struct ps_list_head part_thdpool_core[NUM_CPU]; #define _PART_IDLE_PRIO (_PART_PRIO+4) #define _PART_IDLE_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_IDLE_PRIO) -/* idle thread to wakeup when there is nothing to do on this core! */ -static void -part_idle_fn(void *d) +//struct ps_slab * +//ps_slab_memmgr_alloc(struct ps_mem *m, size_t sz, coreid_t coreid) +//{ +// PRINTC("%s:%d\n", __func__, __LINE__); +// unsigned npages = round_up_to_page(sz) / PAGE_SIZE; +// vaddr_t addr = memmgr_heap_page_allocn(npages); +// +// assert(addr); +// memset((void *)addr, 0, npages * PAGE_SIZE); +// PRINTC("%s:%d\n", __func__, __LINE__); +// +// return (struct ps_slab *)addr; +//} +// +//void +//ps_slab_memmgr_free(struct ps_mem *m, struct ps_slab *s, size_t sz, coreid_t coreid) +//{ +// /* do nothing */ +//} + +/* this? */ +//PS_SLAB_CREATE_AFNS(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +//PS_SLAB_CREATE_AFNS(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +/* or this. */ +//PS_SLAB_CREATE(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ) +//PS_SLAB_CREATE(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ) + +/* for task pool, per core list. 
tasks in pool can migrate cores */ +struct parttask_head { + struct part_task *head; +}; + +static inline void +parttask_store_init(struct parttask_head *h) { - while (1) { - part_pool_wakeup(); - sl_thd_yield_thd(sl__globals_core()->sched_thd); - } + h->head = NULL; } -struct part_data * -part_data_alloc(void) +static inline void +parttask_store_add(struct parttask_head *h, struct part_task *l) { - int i; - struct part_data *d; + struct part_task *n; + l->next_free = NULL; - for (i = 0; i < PART_MAX_TASKS; i++) { - d = part__data + i; + assert(h); + do { + n = ps_load(&h->head); + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} - if (d->flag) continue; +static inline struct part_task * +parttask_store_dequeue(struct parttask_head *h) +{ + struct part_task *l = NULL; - /* if this fails, someone else just alloced it! */ - if (!ps_cas(&d->flag, 0, 1)) continue; + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); - return d; - } + l->next_free = NULL; - return NULL; + return l; } -void -part_data_free(struct part_data *d) +/* for task data, per core pool - task data could migrate pools. */ +struct partdata_head { + struct part_data *head; +}; + +static inline void +partdata_store_init(struct partdata_head *h) { - int f; + h->head = NULL; +} - if (!d) return; +static inline void +partdata_store_add(struct partdata_head *h, struct part_data *l) +{ + struct part_data *n = NULL; + l->next_free = NULL; + assert(h); do { - f = d->flag; - assert(f); - } while (!ps_cas(&d->flag, f, 0)); + n = ps_load(&h->head); + + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); } -struct part_task * -part_task_alloc(part_task_type_t type) + +static inline struct part_data * +partdata_store_dequeue(struct partdata_head *h) +{ + struct part_data *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* end treiber stacks */ +#define PART_TASKS_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_task)) +#define PART_MAX_PAGES (PART_TASKS_MAX_SZ / PAGE_SIZE) +#define PART_DATA_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_data)) +#define PART_MAX_DATA_PAGES (PART_DATA_MAX_SZ / PAGE_SIZE) +#define PART_DEQUE_MAX_PAGES (round_up_to_page(sizeof(struct deque_part)) / PAGE_SIZE) + +struct partdata_head pd_head[NUM_CPU]; + +static inline void +partdata_store_init_all(vaddr_t mem) { int i; - struct part_task *t; - for (i = 0; i < PART_MAX_TASKS; i++) { - t = part_tasks + i; + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_data *st = (struct part_data *)(mem + (PART_DATA_MAX_SZ * i)); - if (ps_load(&t->state) != PART_TASK_S_FREED) continue; + partdata_store_init(&pd_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) partdata_store_add(&pd_head[i], st + j); + } +} + +struct parttask_head pt_head[NUM_CPU]; - /* if this fails, someone else just alloced it! 
*/ - if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) continue; +static inline void +parttask_store_init_all(vaddr_t mem) +{ + int i; - return t; + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_task *st = (struct part_task *)(mem + (PART_TASKS_MAX_SZ * i)); + + parttask_store_init(&pt_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) parttask_store_add(&pt_head[i], st + j); } +} - return NULL; +/* idle thread to wakeup when there is nothing to do on this core! */ +static void +part_idle_fn(void *d) +{ + while (1) { + part_pool_wakeup(); + sl_thd_yield_thd(sl__globals_core()->sched_thd); + } } -void -part_task_free(struct part_task *t) +struct part_data * +part_data_alloc(void) { - part_task_state_t s = 0; + struct part_data *d = partdata_store_dequeue(&pd_head[cos_cpuid()]); + + if (!d) return d; + if (!ps_cas(&d->flag, 0, 1)) assert(0); + + return d; +// int i; +// struct part_data *d = ps_slab_alloc_partdata(); +// +// if (!ps_cas(&d->flag, 0, 1)) assert(0); +// +// return d; +// for (i = 0; i < PART_MAX_TASKS; i++) { +// d = part__data + i; +// +// if (d->flag) continue; +// +// /* if this fails, someone else just alloced it! */ +// if (!ps_cas(&d->flag, 0, 1)) continue; +// +// return d; +// } +// +// return NULL; +} - if (!t) return; +void +part_data_free(struct part_data *d) +{ + if (!ps_cas(&d->flag, 1, 0)) assert(0); + + partdata_store_add(&pd_head[cos_cpuid()], d); +// ps_slab_free_partdata(d); +// int f; +// +// if (!d) return; +// +// do { +// f = d->flag; +// assert(f); +// } while (!ps_cas(&d->flag, f, 0)); +} +struct part_task * +part_task_alloc(part_task_type_t type) +{ + struct part_task *t = parttask_store_dequeue(&pt_head[cos_cpuid()]); + + if (!t) return t; + + /* use upcas ? */ + if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); + + return t; +// struct part_task *t = ps_slab_alloc_parttask(); +// +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); +// +// return t; +// int i; +// struct part_task *t; +// +// for (i = 0; i < PART_MAX_TASKS; i++) { +// t = part_tasks + i; +// +// if (ps_load(&t->state) != PART_TASK_S_FREED) continue; +// +// /* if this fails, someone else just alloced it! 
*/ +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) continue; +// +// return t; +// } +// +// return NULL; +} - do { - s = ps_load(&t->state); - if (s != PART_TASK_S_INITIALIZED) return; - } while (!ps_cas(&t->state, s, PART_TASK_S_FREED)); +void +part_task_free(struct part_task *t) +{ + if (!ps_cas(&t->state, PART_TASK_S_INITIALIZED, PART_TASK_S_FREED)) assert(0); + + parttask_store_add(&pt_head[cos_cpuid()], t); +// ps_slab_free_parttask(t); +// part_task_state_t s = 0; +// +// if (!t) return; +// +// do { +// s = ps_load(&t->state); +// if (s != PART_TASK_S_INITIALIZED) return; +// } while (!ps_cas(&t->state, s, PART_TASK_S_FREED)); } unsigned @@ -123,18 +298,25 @@ part_init(void) ps_list_head_init(&part_thdpool_core[cos_cpuid()]); if (ps_cas(&is_first, NUM_CPU, cos_cpuid())) { + vaddr_t ptmem = 0, pdmem = 0; + for (k = 0; k < NUM_CPU; k++) { part_dq_percore[k] = (struct deque_part *)memmgr_heap_page_allocn(PART_DEQUE_MAX_PAGES); assert(part_dq_percore[k]); deque_init_part(part_dq_percore[k], PART_DEQUE_SZ); } - part_tasks = (struct part_task *)memmgr_heap_page_allocn(PART_MAX_PAGES); - assert(part_tasks); - memset(part_tasks, 0, PART_MAX_PAGES * PAGE_SIZE); - - part__data = (struct part_data *)memmgr_heap_page_allocn(PART_MAX_DATA_PAGES); - assert(part__data); - memset(part__data, 0, PART_MAX_DATA_PAGES * PAGE_SIZE); + ptmem = memmgr_heap_page_allocn(PART_MAX_PAGES * NUM_CPU); + assert(ptmem); + memset((void *)ptmem, 0, PART_MAX_PAGES * PAGE_SIZE * NUM_CPU); + + pdmem = memmgr_heap_page_allocn(PART_MAX_DATA_PAGES * NUM_CPU); + assert(pdmem); + memset((void *)pdmem, 0, PART_MAX_DATA_PAGES * PAGE_SIZE * NUM_CPU); + + partdata_store_init_all(pdmem); + parttask_store_init_all(ptmem); +// ps_slab_init_parttask(); +// ps_slab_init_partdata(); #if defined(PART_ENABLE_NESTED) ps_list_head_init(&part_l_global); From 31dc99ccf3a0629da1276c1f0cb72661e1f679dc Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 17 May 2019 23:44:26 -0400 Subject: [PATCH 086/127] remove unnecessary pushes used in debugging --- src/components/include/sl.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 93f5813327..f7d817655b 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -492,12 +492,6 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) */ __asm__ __volatile__ ( \ - "pushl %%eax\n\t" \ - "pushl %%ebx\n\t" \ - "pushl %%ecx\n\t" \ - "pushl %%edx\n\t" \ - "pushl %%esi\n\t" \ - "pushl %%edi\n\t" \ "pushl %%ebp\n\t" \ "movl %%esp, %%ebp\n\t" \ "movl $2f, (%%eax)\n\t" \ @@ -525,12 +519,6 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) ".align 4\n\t" \ "3:\n\t" \ "popl %%ebp\n\t" \ - "popl %%edi\n\t" \ - "popl %%esi\n\t" \ - "popl %%edx\n\t" \ - "popl %%ecx\n\t" \ - "popl %%ebx\n\t" \ - "popl %%eax\n\t" \ : : "a" (cd), "b" (nd), "S" ((u32_t)((u64_t)tok >> 32)), "D" ((u32_t)(((u64_t)tok << 32) >> 32)), From 8779376684ceff7d168be5f01d90087c91c69db8 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 18 May 2019 00:20:54 -0400 Subject: [PATCH 087/127] schedule something instead of spinnning for response --- src/components/lib/sl/sl_xcore.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c index cb41b76be2..78724c4ed6 100644 --- a/src/components/lib/sl/sl_xcore.c +++ b/src/components/lib/sl/sl_xcore.c @@ -16,7 +16,10 @@ _sl_xcore_response_wait(struct 
sl_xcore_response *r) if (sl_thd_curr() != sl__globals_core()->sched_thd) { if (!ps_load(&r->resp_ready)) sl_thd_block(0); } else { - while (!ps_load(&r->resp_ready)) ; + while (!ps_load(&r->resp_ready)) { + if (sl_cs_enter_sched()) continue; + sl_cs_exit_schedule_nospin(); + } } assert(r->resp_ready); } @@ -78,17 +81,17 @@ static inline int _sl_xcore_request_enqueue_no_cs(cpuid_t core, struct sl_xcore_request *rq) { int ret = 0; -// asndcap_t snd = 0; + asndcap_t snd = 0; if (unlikely(core >= NUM_CPU)) return -1; if (unlikely(core == cos_cpuid())) return -1; if (unlikely(!bitmap_check(sl__globals()->core_bmp, core))) return -1; ret = ck_ring_enqueue_mpsc_xcore(sl__ring(core), sl__ring_buffer(core), rq); -// snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; -// assert(snd); -// + snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; + assert(snd); + // /* send an IPI for the request */ -// cos_asnd(snd, 0); + cos_asnd(snd, 0); if (unlikely(ret == false)) return -1; From 7776e5008ec2ea5cfabb0a0eb996f82d873dc1d9 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 18 May 2019 00:22:06 -0400 Subject: [PATCH 088/127] fixing tied version in sort --- .../implementation/no_interface/omp_sort_bots/sort.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/components/implementation/no_interface/omp_sort_bots/sort.c b/src/components/implementation/no_interface/omp_sort_bots/sort.c index e8347e4ff9..d8140970d6 100644 --- a/src/components/implementation/no_interface/omp_sort_bots/sort.c +++ b/src/components/implementation/no_interface/omp_sort_bots/sort.c @@ -345,11 +345,19 @@ void cilkmerge_par(ELM *low1, ELM *high1, ELM *low2, ELM *high2, ELM *lowdest) * the appropriate location */ *(lowdest + lowsize + 1) = *split1; +#if defined(FORCE_TIED_TASKS) +#pragma omp task + cilkmerge_par(low1, split1 - 1, low2, split2, lowdest); +#pragma omp task + cilkmerge_par(split1 + 1, high1, split2 + 1, high2, + lowdest + lowsize + 2); +#else #pragma omp task untied cilkmerge_par(low1, split1 - 1, low2, split2, lowdest); #pragma omp task untied cilkmerge_par(split1 + 1, high1, split2 + 1, high2, lowdest + lowsize + 2); +#endif #pragma omp taskwait return; @@ -488,7 +496,7 @@ void sort_par ( void ) #pragma omp parallel #pragma omp single nowait #if defined(FORCE_TIED_TASKS) - #pragma omp task untied + #pragma omp task cilksort_par(array, tmp, bots_arg_size); #else #pragma omp task untied From 99af5e8b64178d96a9c549be1419f8b904a63acc Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 18 May 2019 00:22:42 -0400 Subject: [PATCH 089/127] the last thread doesn't block after waking up master on barrier --- src/components/include/part.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/include/part.h b/src/components/include/part.h index 53e60db085..fb502ab294 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -356,7 +356,7 @@ part_task_barrier(struct part_task *t, int is_end) part_peer_wakeup(t); } else { part_master_wakeup(t); - sl_thd_block(0); + //sl_thd_block(0); } } assert(ps_load(&t->barrier_epoch) == cbep + 1); From 223b7d3c028262adbf14f561442a275e93ecaff2 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 18 May 2019 12:29:51 -0400 Subject: [PATCH 090/127] Fixed ipi preemption problem --- .../tests/unit_slxcore/Makefile | 8 ++ .../implementation/tests/unit_slxcore/init.c | 86 +++++++++++++++++++ src/components/include/sl.h | 31 ------- src/components/lib/sl/sl_raw.c | 1 + src/kernel/capinv.c | 6 +- 
src/platform/i386/runscripts/unit_slxcore.sh | 4 + 6 files changed, 104 insertions(+), 32 deletions(-) create mode 100644 src/components/implementation/tests/unit_slxcore/Makefile create mode 100644 src/components/implementation/tests/unit_slxcore/init.c create mode 100644 src/platform/i386/runscripts/unit_slxcore.sh diff --git a/src/components/implementation/tests/unit_slxcore/Makefile b/src/components/implementation/tests/unit_slxcore/Makefile new file mode 100644 index 0000000000..0bc62b21b8 --- /dev/null +++ b/src/components/implementation/tests/unit_slxcore/Makefile @@ -0,0 +1,8 @@ +COMPONENT=unit_slxcoretests.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_rr -lsl_thd_static_backend -lcos_dcb + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_slxcore/init.c b/src/components/implementation/tests/unit_slxcore/init.c new file mode 100644 index 0000000000..7038d767fc --- /dev/null +++ b/src/components/implementation/tests/unit_slxcore/init.c @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include +#include + +#define MAX_PONG 20 +static struct sl_xcore_thd *ping; +static struct sl_xcore_thd *pong[MAX_PONG]; + +static inline void +ping_fn(void *d) +{ + int k = 0; + + while (1) { + sl_xcore_thd_wakeup(pong[k % MAX_PONG]); + k++; + } +} + +static inline void +pong_fn(void *d) +{ + while (1) { + sl_thd_block(0); + } +} + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static unsigned b1 = 0, b2 = 0, b3 = 0; + + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + } else { + while (!ps_load(&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + cos_dcb_info_init_curr(); + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + sl_init(SL_MIN_PERIOD_US); + /* barrier, wait for sl_init to be done on all cores */ + ps_faa(&b1, 1); + while (ps_load(&b1) != NUM_CPU) ; + if (cos_cpuid()) { + for (i = 0; i < MAX_PONG; i++) { + struct sl_thd *t = sl_thd_alloc(pong_fn, NULL); + + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + pong[i] = sl_xcore_thd_lookup(sl_thd_thdid(t)); + assert(pong[i]); + } + } else { + struct sl_thd *t = sl_thd_alloc(ping_fn, NULL); + + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + + ping = sl_xcore_thd_lookup(sl_thd_thdid(t)); + assert(ping); + } + ps_faa(&b2, 1); + while (ps_load(&b2) != NUM_CPU) ; + PRINTC("Ready!"); +// hypercall_comp_init_done(); + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} diff --git a/src/components/include/sl.h b/src/components/include/sl.h index f7d817655b..828a198d4e 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -463,8 +463,6 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout) } } - - static inline int sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { @@ -525,35 +523,6 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); -#if 0 - __asm__ __volatile__ ( \ - "pushl %%ebp\n\t" \ - "movl $2f, (%%eax)\n\t" \ - "movl %%esp, 4(%%eax)\n\t" \ - "cmp $0, 4(%%ebx)\n\t" \ - "je 1f\n\t" \ - "movl %%edx, (%%ecx)\n\t" \ - "movl 4(%%ebx), %%esp\n\t" \ - "jmp *(%%ebx)\n\t" \ - "1:\n\t" \ - "movl %%esp, %%ebp\n\t" \ - "pushl %%edx\n\t" \ - "call sl_thd_kern_dispatch\n\t" \ - "addl $4, %%esp\n\t" \ - "jmp 3f\n\t" \ - ".align 4\n\t" \ - "2:\n\t" \ - "movl $0, 4(%%ebx)\n\t" \ - ".align 4\n\t" \ - "3:\n\t" \ - "popl %%ebp\n\t" \ - : - : "a" (cd), "b" (nd), - "S" ((u32_t)((u64_t)tok >> 32)), "D" ((u32_t)(((u64_t)tok << 32) >> 32)), - "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) - : "memory", "cc"); -#endif - //if (likely(sl_scb_info_core()->sched_tok == tok)) return 0; return 0; diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index 618eac31b5..b73384e10e 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -75,6 +75,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t ps_list_init(t, SL_THD_EVENT_LIST); ps_list_init(t, partlist); sl_thd_event_info_reset(t); + sl_xcore_thd_lookup_init(aep->tid, cos_cpuid()); done: return t; diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 2271153b4e..357ba7b5fb 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -711,6 +711,10 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st int ret; if (thd_cap->cpuid != get_cpuid() || thd_cap->cpuid != next->cpuid) return -EINVAL; + if (unlikely(thd->dcbinfo && thd->dcbinfo->sp)) { + assert((unsigned long)regs->cx == thd->dcbinfo->ip + DCB_IP_KERN_OFF); + assert((unsigned long)regs->bp == thd->dcbinfo->sp); + } if (arcv) { struct cap_arcv *arcv_cap; @@ -781,7 +785,7 @@ cap_ipi_process(struct pt_regs *regs) int i, scan_base; unsigned long ip, sp; - thd_curr = thd_next = thd_current(cos_info); + thd_next = thd_curr = cap_ulthd_lazyupdate(regs, cos_info, 1, &ci); receiver_rings = &IPI_cap_dest[get_cpuid()]; tcap_curr = tcap_next = tcap_current(cos_info); ci = thd_invstk_current(thd_curr, &ip, &sp, cos_info); diff --git 
a/src/platform/i386/runscripts/unit_slxcore.sh b/src/platform/i386/runscripts/unit_slxcore.sh new file mode 100644 index 0000000000..4cb06cf503 --- /dev/null +++ b/src/platform/i386/runscripts/unit_slxcore.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp unit_slxcoretests.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub From abb169b21ed17801e25e53e8b6e01ae59257ffa9 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 18 May 2019 12:29:51 -0400 Subject: [PATCH 091/127] Fixed ipi preemption problem --- .../tests/unit_slxcore/Makefile | 8 ++ .../implementation/tests/unit_slxcore/init.c | 86 +++++++++++++++++++ src/components/include/sl.h | 31 ------- src/components/lib/sl/sl_raw.c | 1 + src/kernel/capinv.c | 10 ++- src/platform/i386/runscripts/unit_slxcore.sh | 4 + 6 files changed, 105 insertions(+), 35 deletions(-) create mode 100644 src/components/implementation/tests/unit_slxcore/Makefile create mode 100644 src/components/implementation/tests/unit_slxcore/init.c create mode 100644 src/platform/i386/runscripts/unit_slxcore.sh diff --git a/src/components/implementation/tests/unit_slxcore/Makefile b/src/components/implementation/tests/unit_slxcore/Makefile new file mode 100644 index 0000000000..0bc62b21b8 --- /dev/null +++ b/src/components/implementation/tests/unit_slxcore/Makefile @@ -0,0 +1,8 @@ +COMPONENT=unit_slxcoretests.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_rr -lsl_thd_static_backend -lcos_dcb + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_slxcore/init.c b/src/components/implementation/tests/unit_slxcore/init.c new file mode 100644 index 0000000000..7038d767fc --- /dev/null +++ b/src/components/implementation/tests/unit_slxcore/init.c @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include +#include + +#define MAX_PONG 20 +static struct sl_xcore_thd *ping; +static struct sl_xcore_thd *pong[MAX_PONG]; + +static inline void +ping_fn(void *d) +{ + int k = 0; + + while (1) { + sl_xcore_thd_wakeup(pong[k % MAX_PONG]); + k++; + } +} + +static inline void +pong_fn(void *d) +{ + while (1) { + sl_thd_block(0); + } +} + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static unsigned b1 = 0, b2 = 0, b3 = 0; + + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + } else { + while (!ps_load(&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + cos_dcb_info_init_curr(); + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + sl_init(SL_MIN_PERIOD_US); + /* barrier, wait for sl_init to be done on all cores */ + ps_faa(&b1, 1); + while (ps_load(&b1) != NUM_CPU) ; + if (cos_cpuid()) { + for (i = 0; i < MAX_PONG; i++) { + struct sl_thd *t = sl_thd_alloc(pong_fn, NULL); + + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + pong[i] = sl_xcore_thd_lookup(sl_thd_thdid(t)); + assert(pong[i]); + } + } else { + struct sl_thd *t = sl_thd_alloc(ping_fn, NULL); + + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + + ping = sl_xcore_thd_lookup(sl_thd_thdid(t)); + assert(ping); + } + ps_faa(&b2, 1); + while (ps_load(&b2) != NUM_CPU) ; + PRINTC("Ready!"); +// hypercall_comp_init_done(); + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} diff --git a/src/components/include/sl.h b/src/components/include/sl.h index f7d817655b..828a198d4e 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -463,8 +463,6 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout) } } - - static inline int sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { @@ -525,35 +523,6 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); -#if 0 - __asm__ __volatile__ ( \ - "pushl %%ebp\n\t" \ - "movl $2f, (%%eax)\n\t" \ - "movl %%esp, 4(%%eax)\n\t" \ - "cmp $0, 4(%%ebx)\n\t" \ - "je 1f\n\t" \ - "movl %%edx, (%%ecx)\n\t" \ - "movl 4(%%ebx), %%esp\n\t" \ - "jmp *(%%ebx)\n\t" \ - "1:\n\t" \ - "movl %%esp, %%ebp\n\t" \ - "pushl %%edx\n\t" \ - "call sl_thd_kern_dispatch\n\t" \ - "addl $4, %%esp\n\t" \ - "jmp 3f\n\t" \ - ".align 4\n\t" \ - "2:\n\t" \ - "movl $0, 4(%%ebx)\n\t" \ - ".align 4\n\t" \ - "3:\n\t" \ - "popl %%ebp\n\t" \ - : - : "a" (cd), "b" (nd), - "S" ((u32_t)((u64_t)tok >> 32)), "D" ((u32_t)(((u64_t)tok << 32) >> 32)), - "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) - : "memory", "cc"); -#endif - //if (likely(sl_scb_info_core()->sched_tok == tok)) return 0; return 0; diff --git a/src/components/lib/sl/sl_raw.c b/src/components/lib/sl/sl_raw.c index 618eac31b5..b73384e10e 100644 --- a/src/components/lib/sl/sl_raw.c +++ b/src/components/lib/sl/sl_raw.c @@ -75,6 +75,7 @@ sl_thd_alloc_init(struct cos_aep_info *aep, asndcap_t sndcap, sl_thd_property_t ps_list_init(t, SL_THD_EVENT_LIST); ps_list_init(t, partlist); sl_thd_event_info_reset(t); + sl_xcore_thd_lookup_init(aep->tid, cos_cpuid()); done: return t; diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 2271153b4e..51a363453f 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -711,6 +711,10 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st int ret; if (thd_cap->cpuid != get_cpuid() || thd_cap->cpuid != next->cpuid) return -EINVAL; + if (unlikely(thd->dcbinfo && thd->dcbinfo->sp)) { + assert((unsigned long)regs->cx == thd->dcbinfo->ip + DCB_IP_KERN_OFF); + assert((unsigned long)regs->bp == thd->dcbinfo->sp); + } if (arcv) { struct cap_arcv *arcv_cap; @@ -779,13 +783,11 @@ cap_ipi_process(struct pt_regs *regs) struct tcap *tcap_curr, *tcap_next; struct comp_info *ci; int i, scan_base; - unsigned long ip, sp; - thd_curr = thd_next = thd_current(cos_info); + thd_next = thd_curr = cap_ulthd_lazyupdate(regs, cos_info, 1, &ci); + assert(ci && ci->captbl); receiver_rings = &IPI_cap_dest[get_cpuid()]; tcap_curr = tcap_next = 
tcap_current(cos_info); - ci = thd_invstk_current(thd_curr, &ip, &sp, cos_info); - assert(ci && ci->captbl); scan_base = receiver_rings->start; receiver_rings->start = (receiver_rings->start + 1) % NUM_CPU; diff --git a/src/platform/i386/runscripts/unit_slxcore.sh b/src/platform/i386/runscripts/unit_slxcore.sh new file mode 100644 index 0000000000..4cb06cf503 --- /dev/null +++ b/src/platform/i386/runscripts/unit_slxcore.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp unit_slxcoretests.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub From 1a4aa60fe098214722b06bba7820146dcab058c8 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 18 May 2019 16:33:24 -0400 Subject: [PATCH 092/127] token checking code restored (missed in debugging) --- src/components/include/sl.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 828a198d4e..c628ee41b4 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -523,10 +523,9 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); - //if (likely(sl_scb_info_core()->sched_tok == tok)) return 0; + if (likely(sl_scb_info_core()->sched_tok == tok)) return 0; - return 0; - //return -EAGAIN; + return -EAGAIN; } static inline int From 51f6cd91767c30c4e76353c99c22d8278783f412 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 18 May 2019 16:34:09 -0400 Subject: [PATCH 093/127] Multiple wakeups cause race in SL. * with multi-core execution, a thread executing parent task could be woken up multiple times by threads on other cores finishing up with the child tasks before master goes to synchronize. For now, fixed the way I fixed it long back in voter code. * I don't know if this problem will go away if we use blockpoints for parent-child sync, but will try that. 
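The workaround keeps a "wakeup arrived early" record on the thread so that
the next block consumes it instead of sleeping through it, and duplicate
wakeups collapse into that one record. A reduced sketch of the state
machine, concurrency aside (the sketch_* names are made up here; the real
transitions live in sl_thd_block()/sl_thd_wakeup_no_cs() and run under the
scheduler critical section):

    enum sketch_state { SKETCH_RUNNABLE, SKETCH_BLOCKED, SKETCH_WOKEN };

    /* returns 1 when a pending wakeup was consumed, i.e. do not sleep */
    static int
    sketch_block(enum sketch_state *s)
    {
            if (*s == SKETCH_WOKEN) {
                    *s = SKETCH_RUNNABLE;
                    return 1;
            }
            *s = SKETCH_BLOCKED;

            return 0;
    }

    /* returns 1 when the target was not blocked and the wakeup was only recorded */
    static int
    sketch_wakeup(enum sketch_state *s)
    {
            if (*s != SKETCH_BLOCKED) {
                    *s = SKETCH_WOKEN; /* absorbs early and duplicate wakeups */
                    return 1;
            }
            *s = SKETCH_RUNNABLE;

            return 0;
    }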
--- src/components/include/part.h | 11 +++++++---- src/components/lib/sl/sl_mod_part_fifo.c | 4 ++++ src/components/lib/sl/sl_sched.c | 20 ++++++++++++++++++-- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/components/include/part.h b/src/components/include/part.h index fb502ab294..522d04abed 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -141,7 +141,7 @@ part_pool_wakeup(void) if (unlikely(ps_list_head_empty(part_thdpool_curr()))) goto done; t = ps_list_head_first(part_thdpool_curr(), struct sl_thd, partlist); - ps_list_rem(t, partlist); + /* removal from the list is taken care in mod_part_fifo */ if (t == sl_thd_curr()) goto done; sl_thd_wakeup_no_cs(t); done: @@ -155,12 +155,15 @@ part_pool_block(void) #ifdef PART_ENABLE_BLOCKING struct sl_thd *t = sl_thd_curr(); + /* very much a replica of sl_thd_block + adding to thread pool in part */ sl_cs_enter(); + if (sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0)) { + sl_cs_exit(); + return; + } if (ps_list_singleton(t, partlist)) ps_list_head_append(part_thdpool_curr(), t, partlist); - if (!sl_thd_is_runnable(t)) assert(0); - - sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0); sl_cs_exit_schedule(); + assert(sl_thd_is_runnable(t)); #else sl_thd_yield(0); #endif diff --git a/src/components/lib/sl/sl_mod_part_fifo.c b/src/components/lib/sl/sl_mod_part_fifo.c index 0c1e727284..1b9e5cb72e 100644 --- a/src/components/lib/sl/sl_mod_part_fifo.c +++ b/src/components/lib/sl/sl_mod_part_fifo.c @@ -59,10 +59,14 @@ sl_mod_block(struct sl_thd_policy *t) void sl_mod_wakeup(struct sl_thd_policy *t) { + struct sl_thd *tm = sl_mod_thd_get(t); + assert(t != idle_thd[cos_cpuid()]); assert(ps_list_singleton_d(t)); ps_list_head_append_d(&threads[cos_cpuid()], t); + /* remove from partlist used for tracking free pool of tasks on this core! */ + if (!ps_list_singleton(tm, partlist)) ps_list_rem(tm, partlist); } void diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 0dd888ea07..7f6329a2a6 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -259,6 +259,7 @@ sl_thd_block(thdid_t tid) return; } sl_cs_exit_schedule(); + assert(sl_thd_is_runnable(t)); return; } @@ -413,8 +414,23 @@ sl_thd_wakeup_no_cs(struct sl_thd *t) return 0; } - if (unlikely(sl_thd_is_runnable(t))) { - /* t->state == SL_THD_WOKEN? multiple wakeups? */ +// if (unlikely(sl_thd_is_runnable(t))) { +// /* t->state == SL_THD_WOKEN? multiple wakeups? */ +// t->state = SL_THD_WOKEN; +// return 1; +// } + /* + * TODO: with blockpoints, multiple wakeup problem might go away. + * will try that next! + * + * For now, if a thread creates N tasks and if at least two of them + * complete before master goes to block, which can happen on multi-core + * execution of tasks, then that results in multiple wakeups! + */ + if (unlikely(t->state == SL_THD_WOKEN)) { + t->state = SL_THD_RUNNABLE; + return 1; + } else if (unlikely(t->state == SL_THD_RUNNABLE)) { t->state = SL_THD_WOKEN; return 1; } From 48cd69b79e164c9a613d3da8979a690072655e7a Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 18 May 2019 16:34:09 -0400 Subject: [PATCH 094/127] Multiple wakeups cause race in SL. * with multi-core execution, a thread executing parent task could be woken up multiple times by threads on other cores finishing up with the child tasks before master goes to synchronize. For now, fixed the way I fixed it long back in voter code. * perhaps use of blockpoints will solve this problem? lets see! 
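On the blockpoint question: the property that would make extra wakeups
harmless there is that the waiter re-checks its wait predicate after every
wakeup. A tiny illustration of that pattern, spinning where the scheduler
version would block (the sketch_join names are invented for this sketch and
are not the part or blockpoint API):

    struct sketch_join { volatile unsigned long done; unsigned long nchildren; };

    /* child side: publish completion; waking the parent more than once is benign */
    static void
    sketch_child_done(struct sketch_join *j)
    {
            __atomic_fetch_add(&j->done, 1, __ATOMIC_RELEASE);
    }

    /* parent side: every wakeup, duplicate or not, just re-evaluates the predicate */
    static void
    sketch_join_wait(struct sketch_join *j)
    {
            while (__atomic_load_n(&j->done, __ATOMIC_ACQUIRE) != j->nchildren)
                    ;
    }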
--- src/components/include/part.h | 11 +++++++---- src/components/lib/sl/sl_mod_part_fifo.c | 4 ++++ src/components/lib/sl/sl_sched.c | 20 ++++++++++++++++++-- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/components/include/part.h b/src/components/include/part.h index fb502ab294..522d04abed 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -141,7 +141,7 @@ part_pool_wakeup(void) if (unlikely(ps_list_head_empty(part_thdpool_curr()))) goto done; t = ps_list_head_first(part_thdpool_curr(), struct sl_thd, partlist); - ps_list_rem(t, partlist); + /* removal from the list is taken care in mod_part_fifo */ if (t == sl_thd_curr()) goto done; sl_thd_wakeup_no_cs(t); done: @@ -155,12 +155,15 @@ part_pool_block(void) #ifdef PART_ENABLE_BLOCKING struct sl_thd *t = sl_thd_curr(); + /* very much a replica of sl_thd_block + adding to thread pool in part */ sl_cs_enter(); + if (sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0)) { + sl_cs_exit(); + return; + } if (ps_list_singleton(t, partlist)) ps_list_head_append(part_thdpool_curr(), t, partlist); - if (!sl_thd_is_runnable(t)) assert(0); - - sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0); sl_cs_exit_schedule(); + assert(sl_thd_is_runnable(t)); #else sl_thd_yield(0); #endif diff --git a/src/components/lib/sl/sl_mod_part_fifo.c b/src/components/lib/sl/sl_mod_part_fifo.c index 0c1e727284..1b9e5cb72e 100644 --- a/src/components/lib/sl/sl_mod_part_fifo.c +++ b/src/components/lib/sl/sl_mod_part_fifo.c @@ -59,10 +59,14 @@ sl_mod_block(struct sl_thd_policy *t) void sl_mod_wakeup(struct sl_thd_policy *t) { + struct sl_thd *tm = sl_mod_thd_get(t); + assert(t != idle_thd[cos_cpuid()]); assert(ps_list_singleton_d(t)); ps_list_head_append_d(&threads[cos_cpuid()], t); + /* remove from partlist used for tracking free pool of tasks on this core! */ + if (!ps_list_singleton(tm, partlist)) ps_list_rem(tm, partlist); } void diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 0dd888ea07..7f6329a2a6 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -259,6 +259,7 @@ sl_thd_block(thdid_t tid) return; } sl_cs_exit_schedule(); + assert(sl_thd_is_runnable(t)); return; } @@ -413,8 +414,23 @@ sl_thd_wakeup_no_cs(struct sl_thd *t) return 0; } - if (unlikely(sl_thd_is_runnable(t))) { - /* t->state == SL_THD_WOKEN? multiple wakeups? */ +// if (unlikely(sl_thd_is_runnable(t))) { +// /* t->state == SL_THD_WOKEN? multiple wakeups? */ +// t->state = SL_THD_WOKEN; +// return 1; +// } + /* + * TODO: with blockpoints, multiple wakeup problem might go away. + * will try that next! + * + * For now, if a thread creates N tasks and if at least two of them + * complete before master goes to block, which can happen on multi-core + * execution of tasks, then that results in multiple wakeups! 
+ */ + if (unlikely(t->state == SL_THD_WOKEN)) { + t->state = SL_THD_RUNNABLE; + return 1; + } else if (unlikely(t->state == SL_THD_RUNNABLE)) { t->state = SL_THD_WOKEN; return 1; } From 7dffdb459747882b57a55e5722a6cf6776c30a55 Mon Sep 17 00:00:00 2001 From: phani Date: Sat, 18 May 2019 22:37:26 -0400 Subject: [PATCH 095/127] fix comment --- src/components/lib/sl/sl_xcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c index 78724c4ed6..e46fc92113 100644 --- a/src/components/lib/sl/sl_xcore.c +++ b/src/components/lib/sl/sl_xcore.c @@ -90,7 +90,7 @@ _sl_xcore_request_enqueue_no_cs(cpuid_t core, struct sl_xcore_request *rq) snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; assert(snd); -// /* send an IPI for the request */ + /* send an IPI for the request */ cos_asnd(snd, 0); if (unlikely(ret == false)) return -1; From 84d860ee1bb7bf640e72e4ba725914c824cdf52a Mon Sep 17 00:00:00 2001 From: phani Date: Sun, 19 May 2019 13:09:51 -0400 Subject: [PATCH 096/127] Modified to not use capmgr for now --- .../no_interface/omp_dijkstra/Makefile | 4 +- .../no_interface/omp_dijkstra/posix_basic.c | 6 +- .../no_interface/omp_fib_bots/Makefile | 4 +- .../no_interface/omp_hello/Makefile | 4 +- .../no_interface/omp_hello/init.c | 6 +- .../no_interface/omp_sort_bots/Makefile | 4 +- src/components/lib/Makefile | 2 +- src/components/lib/{part.c => part_capmgr.c} | 0 src/components/lib/part_raw.c | 365 ++++++++++++++++++ src/kernel/include/shared/consts.h | 2 +- src/kernel/include/shared/cos_types.h | 7 +- src/platform/i386/runscripts/omp_dijkstra.sh | 13 +- src/platform/i386/runscripts/omp_fib_bots.sh | 13 +- src/platform/i386/runscripts/omp_hello.sh | 13 +- src/platform/i386/runscripts/omp_sort_bots.sh | 13 +- 15 files changed, 417 insertions(+), 39 deletions(-) rename src/components/lib/{part.c => part_capmgr.c} (100%) create mode 100644 src/components/lib/part_raw.c diff --git a/src/components/implementation/no_interface/omp_dijkstra/Makefile b/src/components/implementation/no_interface/omp_dijkstra/Makefile index c81e74faf6..a702328c38 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/Makefile +++ b/src/components/implementation/no_interface/omp_dijkstra/Makefile @@ -1,8 +1,8 @@ COMPONENT=omp_dijkstra.o INTERFACES= -DEPENDENCIES=capmgr +#DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt -lps +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c index c9ae04645d..c29ae23770 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c +++ b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c @@ -8,10 +8,11 @@ #include #include +#include #include #include -#include +//#include // HACK: The hack to end all hacks void * @@ -37,7 +38,8 @@ cos_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) pages = length / 4096; } - addr = (void *)memmgr_heap_page_allocn(pages); + //addr = (void *)memmgr_heap_page_allocn(pages); + addr = (void 
*)cos_page_bump_allocn(cos_compinfo_get(cos_defcompinfo_curr_get()), pages * PAGE_SIZE); if (!addr){ ret = (void *) -1; } else { diff --git a/src/components/implementation/no_interface/omp_fib_bots/Makefile b/src/components/implementation/no_interface/omp_fib_bots/Makefile index 7eea727205..bee96fd0aa 100644 --- a/src/components/implementation/no_interface/omp_fib_bots/Makefile +++ b/src/components/implementation/no_interface/omp_fib_bots/Makefile @@ -1,8 +1,8 @@ COMPONENT=omp_fib_bots.o INTERFACES= -DEPENDENCIES=capmgr +#DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt -lps +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_hello/Makefile b/src/components/implementation/no_interface/omp_hello/Makefile index 185540bca8..ba90175127 100644 --- a/src/components/implementation/no_interface/omp_hello/Makefile +++ b/src/components/implementation/no_interface/omp_hello/Makefile @@ -1,8 +1,8 @@ COMPONENT=omp_hello.o INTERFACES= -DEPENDENCIES=capmgr +#DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt -lps +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/no_interface/omp_hello/init.c b/src/components/implementation/no_interface/omp_hello/init.c index f4ce213f12..2583300907 100644 --- a/src/components/implementation/no_interface/omp_hello/init.c +++ b/src/components/implementation/no_interface/omp_hello/init.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include int main(void); @@ -36,12 +36,13 @@ cos_init(void *d) PRINTC("In an OpenMP program!\n"); if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_init(); + cos_defcompinfo_llinit(); } else { while (!ps_load(&init_done[first])) ; cos_defcompinfo_sched_init(); } + cos_dcb_info_init_curr(); ps_faa(&init_done[cos_cpuid()], 1); /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ @@ -56,7 +57,6 @@ cos_init(void *d) /* barrier, wait for gomp_init to be done on all cores */ ps_faa(&b2, 1); while (ps_load(&b2) != NUM_CPU) ; - hypercall_comp_init_done(); if (!cos_cpuid()) { struct sl_thd *t = NULL; diff --git a/src/components/implementation/no_interface/omp_sort_bots/Makefile b/src/components/implementation/no_interface/omp_sort_bots/Makefile index 5d1d63e2cf..05d43d1f94 100644 --- a/src/components/implementation/no_interface/omp_sort_bots/Makefile +++ b/src/components/implementation/no_interface/omp_sort_bots/Makefile @@ -1,8 +1,8 @@ COMPONENT=omp_sort_bots.o INTERFACES= -DEPENDENCIES=capmgr +#DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLCAPMGR) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart -lsl_blkpt -lps +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index 0255456e16..e6313481a5 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -1,6 +1,6 @@ include Makefile.src Makefile.comp -LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o part.o +LIB_OBJS=heap.o cobj_format.o cos_kernel_api.o cos_defkernel_api.o cos_dcb.o part_raw.o part_capmgr.o LIBS=$(LIB_OBJS:%.o=%.a) MANDITORY=c_stub.o cos_asm_upcall.o cos_asm_ainv.o cos_component.o MAND=$(MANDITORY_LIB) diff --git a/src/components/lib/part.c b/src/components/lib/part_capmgr.c similarity index 100% rename from src/components/lib/part.c rename to src/components/lib/part_capmgr.c diff --git a/src/components/lib/part_raw.c b/src/components/lib/part_raw.c new file mode 100644 index 0000000000..a499c7efec --- /dev/null +++ b/src/components/lib/part_raw.c @@ -0,0 +1,365 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct deque_part *part_dq_percore[NUM_CPU]; +//struct cirque_par parcq_global; +static volatile unsigned part_ready = 0; +volatile int in_main_parallel; +#if defined(PART_ENABLE_NESTED) +struct crt_lock part_l_lock; +struct ps_list_head part_l_global; +#else +struct part_task main_task; +#endif +//static struct part_task *part_tasks = NULL; +//static struct part_data *part__data = NULL; +struct ps_list_head part_thdpool_core[NUM_CPU]; + +#define PART_DEQUE_SZ PART_MAX_TASKS +#define _PART_PRIO TCAP_PRIO_MAX +#define _PART_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_PRIO) + +#define _PART_IDLE_PRIO (_PART_PRIO+4) +#define _PART_IDLE_PRIO_PACK() sched_param_pack(SCHEDP_PRIO, _PART_IDLE_PRIO) + +//struct ps_slab * +//ps_slab_memmgr_alloc(struct ps_mem *m, size_t sz, coreid_t coreid) +//{ +// PRINTC("%s:%d\n", __func__, __LINE__); +// unsigned npages = round_up_to_page(sz) / PAGE_SIZE; +// vaddr_t addr = memmgr_heap_page_allocn(npages); +// +// assert(addr); +// memset((void *)addr, 0, npages * PAGE_SIZE); +// PRINTC("%s:%d\n", __func__, __LINE__); +// +// return (struct ps_slab *)addr; +//} +// +//void +//ps_slab_memmgr_free(struct ps_mem *m, struct ps_slab *s, size_t sz, coreid_t coreid) +//{ +// /* do nothing */ +//} + +/* this? 
*/ +//PS_SLAB_CREATE_AFNS(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +//PS_SLAB_CREATE_AFNS(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ, 0, ps_slab_memmgr_alloc, ps_slab_memmgr_free); +/* or this. */ +//PS_SLAB_CREATE(parttask, sizeof(struct part_task), PART_TASKS_MAX_SZ) +//PS_SLAB_CREATE(partdata, sizeof(struct part_data), PART_DATA_MAX_SZ) + +/* for task pool, per core list. tasks in pool can migrate cores */ +struct parttask_head { + struct part_task *head; +}; + +static inline void +parttask_store_init(struct parttask_head *h) +{ + h->head = NULL; +} + +static inline void +parttask_store_add(struct parttask_head *h, struct part_task *l) +{ + struct part_task *n; + l->next_free = NULL; + + assert(h); + do { + n = ps_load(&h->head); + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} + +static inline struct part_task * +parttask_store_dequeue(struct parttask_head *h) +{ + struct part_task *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* for task data, per core pool - task data could migrate pools. */ +struct partdata_head { + struct part_data *head; +}; + +static inline void +partdata_store_init(struct partdata_head *h) +{ + h->head = NULL; +} + +static inline void +partdata_store_add(struct partdata_head *h, struct part_data *l) +{ + struct part_data *n = NULL; + l->next_free = NULL; + + assert(h); + do { + n = ps_load(&h->head); + + l->next_free = n; + } while (!ps_cas(&h->head, (unsigned long)n, (unsigned long)l)); +} + +static inline struct part_data * +partdata_store_dequeue(struct partdata_head *h) +{ + struct part_data *l = NULL; + + do { + l = ps_load(&h->head); + if (unlikely(!l)) return NULL; + } while (!ps_cas(&h->head, (unsigned long)l, (unsigned long)l->next_free)); + + l->next_free = NULL; + + return l; +} + +/* end treiber stacks */ +#define PART_TASKS_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_task)) +#define PART_MAX_PAGES (PART_TASKS_MAX_SZ / PAGE_SIZE) +#define PART_DATA_MAX_SZ round_up_to_page(PART_MAX_TASKS * sizeof(struct part_data)) +#define PART_MAX_DATA_PAGES (PART_DATA_MAX_SZ / PAGE_SIZE) +#define PART_DEQUE_MAX_SZ round_up_to_page(sizeof(struct deque_part)) +#define PART_DEQUE_MAX_PAGES (PART_DEQUE_MAX_SZ / PAGE_SIZE) + +struct partdata_head pd_head[NUM_CPU]; + +static inline void +partdata_store_init_all(vaddr_t mem) +{ + int i; + + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_data *st = (struct part_data *)(mem + (PART_DATA_MAX_SZ * i)); + + partdata_store_init(&pd_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) partdata_store_add(&pd_head[i], st + j); + } +} + +struct parttask_head pt_head[NUM_CPU]; + +static inline void +parttask_store_init_all(vaddr_t mem) +{ + int i; + + for (i = 0; i < NUM_CPU; i++) { + int j; + struct part_task *st = (struct part_task *)(mem + (PART_TASKS_MAX_SZ * i)); + + parttask_store_init(&pt_head[i]); + + for (j = 0; j < PART_MAX_TASKS; j++) parttask_store_add(&pt_head[i], st + j); + } +} + +/* idle thread to wakeup when there is nothing to do on this core! 
*/ +static void +part_idle_fn(void *d) +{ + while (1) { + part_pool_wakeup(); + sl_thd_yield_thd(sl__globals_core()->sched_thd); + } +} + +struct part_data * +part_data_alloc(void) +{ + struct part_data *d = partdata_store_dequeue(&pd_head[cos_cpuid()]); + + if (!d) return d; + if (!ps_cas(&d->flag, 0, 1)) assert(0); + + return d; +// int i; +// struct part_data *d = ps_slab_alloc_partdata(); +// +// if (!ps_cas(&d->flag, 0, 1)) assert(0); +// +// return d; +// for (i = 0; i < PART_MAX_TASKS; i++) { +// d = part__data + i; +// +// if (d->flag) continue; +// +// /* if this fails, someone else just alloced it! */ +// if (!ps_cas(&d->flag, 0, 1)) continue; +// +// return d; +// } +// +// return NULL; +} + +void +part_data_free(struct part_data *d) +{ + if (!ps_cas(&d->flag, 1, 0)) assert(0); + + partdata_store_add(&pd_head[cos_cpuid()], d); +// ps_slab_free_partdata(d); +// int f; +// +// if (!d) return; +// +// do { +// f = d->flag; +// assert(f); +// } while (!ps_cas(&d->flag, f, 0)); +} +struct part_task * +part_task_alloc(part_task_type_t type) +{ + struct part_task *t = parttask_store_dequeue(&pt_head[cos_cpuid()]); + + if (!t) return t; + + /* use upcas ? */ + if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); + + return t; +// struct part_task *t = ps_slab_alloc_parttask(); +// +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) assert(0); +// +// return t; +// int i; +// struct part_task *t; +// +// for (i = 0; i < PART_MAX_TASKS; i++) { +// t = part_tasks + i; +// +// if (ps_load(&t->state) != PART_TASK_S_FREED) continue; +// +// /* if this fails, someone else just alloced it! */ +// if (!ps_cas(&t->state, PART_TASK_S_FREED, PART_TASK_S_ALLOCATED)) continue; +// +// return t; +// } +// +// return NULL; +} + +void +part_task_free(struct part_task *t) +{ + if (!ps_cas(&t->state, PART_TASK_S_INITIALIZED, PART_TASK_S_FREED)) assert(0); + + parttask_store_add(&pt_head[cos_cpuid()], t); +// ps_slab_free_parttask(t); +// part_task_state_t s = 0; +// +// if (!t) return; +// +// do { +// s = ps_load(&t->state); +// if (s != PART_TASK_S_INITIALIZED) return; +// } while (!ps_cas(&t->state, s, PART_TASK_S_FREED)); +} + +unsigned +part_isready(void) +{ return (part_ready == NUM_CPU); } + +void +part_init(void) +{ + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + int k; + static volatile int is_first = NUM_CPU; + struct sl_thd *it = NULL; + struct sl_xcore_thd *xit = NULL; + sched_param_t ip = _PART_IDLE_PRIO_PACK(); + static volatile int all_done = 0; + + ps_list_head_init(&part_thdpool_core[cos_cpuid()]); + if (ps_cas(&is_first, NUM_CPU, cos_cpuid())) { + vaddr_t ptmem = 0, pdmem = 0; + + for (k = 0; k < NUM_CPU; k++) { + part_dq_percore[k] = (struct deque_part *)cos_page_bump_allocn(ci, PART_DEQUE_MAX_SZ); + assert(part_dq_percore[k]); + deque_init_part(part_dq_percore[k], PART_DEQUE_SZ); + } + ptmem = cos_page_bump_allocn(ci, PART_TASKS_MAX_SZ * NUM_CPU); + assert(ptmem); + memset((void *)ptmem, 0, PART_MAX_PAGES * PAGE_SIZE * NUM_CPU); + + pdmem = cos_page_bump_allocn(ci, PART_DATA_MAX_SZ * NUM_CPU); + assert(pdmem); + memset((void *)pdmem, 0, PART_MAX_DATA_PAGES * PAGE_SIZE * NUM_CPU); + + partdata_store_init_all(pdmem); + parttask_store_init_all(ptmem); +// ps_slab_init_parttask(); +// ps_slab_init_partdata(); + +#if defined(PART_ENABLE_NESTED) + ps_list_head_init(&part_l_global); + crt_lock_init(&part_l_lock); +#else + memset(&main_task, 0, sizeof(main_task)); +#endif + in_main_parallel = 0; + } + + for (k = 0; k < 
PART_MAX_CORE_THDS; k++) { + struct sl_xcore_thd *x; + struct sl_thd *t; + sched_param_t p = _PART_PRIO_PACK(); + + t = sl_thd_alloc(part_thd_fn, NULL); + assert(t); + + sl_thd_param_set(t, p); + + x = sl_xcore_thd_lookup_init(sl_thd_thdid(t), cos_cpuid()); + assert(x); + } + +#ifdef PART_ENABLE_BLOCKING + sl_cs_enter(); + /* + * because it's fifo, all threads would go block + * themselves up as there is no work yet + * eventually returning to this main thread on core-0, + * and on all other cores, scheduler would be running! + */ + sl_cs_exit_schedule(); + it = sl_thd_alloc(part_idle_fn, NULL); + assert(it); + sl_thd_param_set(it, ip); +#endif + + ps_faa(&all_done, 1); + while (ps_load(&all_done) != NUM_CPU) ; + + ps_faa(&part_ready, 1); +} diff --git a/src/kernel/include/shared/consts.h b/src/kernel/include/shared/consts.h index 2c891fdc20..d5cb53b9d9 100644 --- a/src/kernel/include/shared/consts.h +++ b/src/kernel/include/shared/consts.h @@ -48,7 +48,7 @@ struct pt_regs { #endif #define MAX_SERVICE_DEPTH 31 -#define MAX_NUM_THREADS (256 * NUM_CPU) +#define MAX_NUM_THREADS (2048) /* Stacks are 2 * page_size (expressed in words) */ #define MAX_STACK_SZ_BYTE_ORDER 12 diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index 5eb0cf7e3c..0cd3cbc7f4 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -273,7 +273,6 @@ enum /* * NOTE: kernel doesn't support sharing a cache-line across cores, * so optimize to place INIT THD/TCAP on same cache line and bump by 64B for next CPU - * Update: add per-core INIT DCB cap in to the same cache-line. */ BOOT_CAPTBL_SELF_INITRCV_BASE = round_up_to_pow2(BOOT_CAPTBL_SELF_INITTHD_BASE + NUM_CPU * CAP64B_IDSZ, CAPMAX_ENTRY_SZ), @@ -295,11 +294,11 @@ enum enum llboot_scb_dcb_caps { LLBOOT_CAPTBL_SCB = round_up_to_pow2(BOOT_CAPTBL_LAST_CAP, CAPMAX_ENTRY_SZ), - LLBOOT_CAPTBL_INITDCB = LLBOOT_CAPTBL_SCB + CAP32B_IDSZ, - LLBOOT_CAPTBL_FREE = round_up_to_pow2(LLBOOT_CAPTBL_INITDCB + (CAP32B_IDSZ * NUM_CPU), CAPMAX_ENTRY_SZ), + LLBOOT_CAPTBL_INITDCB = LLBOOT_CAPTBL_SCB + CAP64B_IDSZ, + LLBOOT_CAPTBL_FREE = round_up_to_pow2(LLBOOT_CAPTBL_INITDCB + (CAP64B_IDSZ * NUM_CPU), CAPMAX_ENTRY_SZ), }; -#define LLBOOT_CAPTBL_INITDCB_CPU(cpuid) (LLBOOT_CAPTBL_INITDCB + (CAP32B_IDSZ * cpuid)) +#define LLBOOT_CAPTBL_INITDCB_CPU(cpuid) (LLBOOT_CAPTBL_INITDCB + (CAP64B_IDSZ * cpuid)) #define LLBOOT_CAPTBL_CPU_INITDCB (LLBOOT_CAPTBL_INITDCB_CPU(cos_cpuid())) /* diff --git a/src/platform/i386/runscripts/omp_dijkstra.sh b/src/platform/i386/runscripts/omp_dijkstra.sh index 0906da77e8..128366ed60 100644 --- a/src/platform/i386/runscripts/omp_dijkstra.sh +++ b/src/platform/i386/runscripts/omp_dijkstra.sh @@ -1,7 +1,10 @@ #!/bin/sh -cp llboot_comp.o llboot.o -cp omp_dijkstra.o boot.o -cp test_boot.o dummy1.o -cp test_boot.o dummy2.o -./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub +cp omp_dijkstra.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_dijkstra.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_fib_bots.sh b/src/platform/i386/runscripts/omp_fib_bots.sh index 22edc6b958..5c4465f351 100644 --- a/src/platform/i386/runscripts/omp_fib_bots.sh +++ b/src/platform/i386/runscripts/omp_fib_bots.sh @@ -1,7 +1,10 @@ #!/bin/sh -cp llboot_comp.o 
llboot.o -cp omp_fib_bots.o boot.o -cp test_boot.o dummy1.o -cp test_boot.o dummy2.o -./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub +cp omp_fib_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_fib_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_hello.sh b/src/platform/i386/runscripts/omp_hello.sh index 5284d0f941..342a043e00 100644 --- a/src/platform/i386/runscripts/omp_hello.sh +++ b/src/platform/i386/runscripts/omp_hello.sh @@ -1,7 +1,10 @@ #!/bin/sh -cp llboot_comp.o llboot.o -cp omp_hello.o boot.o -cp test_boot.o dummy1.o -cp test_boot.o dummy2.o -./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub +cp omp_hello.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_hello.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_sort_bots.sh b/src/platform/i386/runscripts/omp_sort_bots.sh index 3f65db092f..cf71756905 100644 --- a/src/platform/i386/runscripts/omp_sort_bots.sh +++ b/src/platform/i386/runscripts/omp_sort_bots.sh @@ -1,7 +1,10 @@ #!/bin/sh -cp llboot_comp.o llboot.o -cp omp_sort_bots.o boot.o -cp test_boot.o dummy1.o -cp test_boot.o dummy2.o -./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub +cp omp_sort_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_sort_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From 7bc6a9d15086e4d7d5db115dc9fb06e32f986409 Mon Sep 17 00:00:00 2001 From: phani Date: Sun, 19 May 2019 21:23:53 -0400 Subject: [PATCH 097/127] Possible fix: thread blocking when it is already blocked * Reverted the thread capability size back to 16B; Gabe said we probably do not need the larger size. * There is a strange block->block problem that clearly relates to the return value of the dispatch function. However, it is not clear why the return value can be 0 while the thread we returned to is not runnable. Something between a thread releasing the critical section and the actual dispatch may trigger additional dispatches or a state change. I do not expect that to cause problems, but for now the workaround is: if the current thread is not runnable upon return from dispatch, just return an error.
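A minimal standalone sketch of the workaround's shape (illustrative only: thd_state_t, dispatch_to() and switch_with_check() are hypothetical stand-ins; the real check uses sl_thd_dispatch() and sl_thd_is_runnable() in sl.h, as the diff below shows):

#include <stdio.h>

typedef enum { THD_RUNNABLE, THD_BLOCKED } thd_state_t;
struct thd { thd_state_t state; };

/* Stand-in for the user-level dispatch: it may report success (0) even
 * though the calling thread ended up blocked -- the anomaly described above. */
static int
dispatch_to(struct thd *curr, struct thd *next)
{
        (void)curr; (void)next;
        return 0;
}

/* The workaround: do not trust the return value alone; re-check that the
 * current thread is still runnable after the switch attempt. */
static int
switch_with_check(struct thd *curr, struct thd *next)
{
        int ret = dispatch_to(curr, next);

        if (curr->state != THD_RUNNABLE) return -1; /* reported as -EAGAIN in sl.h */

        return ret;
}

int
main(void)
{
        struct thd curr = { THD_BLOCKED }, next = { THD_RUNNABLE };

        printf("switch returned %d\n", switch_with_check(&curr, &next));

        return 0;
}

The actual change applies this check in both sl_cs_exit_schedule_nospin_arg() and its timeout variant.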
--- .../no_interface/omp_hello/init.c | 1 + src/components/include/part.h | 25 +--- src/components/include/part_task.h | 139 +++++++++--------- src/components/include/sl.h | 46 +++++- src/components/lib/cos_gomp/cos_gomp.c | 10 +- src/components/lib/cos_gomp/cos_omp.c | 2 +- src/components/lib/part_capmgr.c | 6 +- src/components/lib/part_raw.c | 16 +- src/components/lib/sl/sl_mod_part_fifo.c | 1 - src/components/lib/sl/sl_sched.c | 24 +-- src/kernel/capinv.c | 1 + src/kernel/include/shared/cos_types.h | 14 +- 12 files changed, 153 insertions(+), 132 deletions(-) diff --git a/src/components/implementation/no_interface/omp_hello/init.c b/src/components/implementation/no_interface/omp_hello/init.c index 2583300907..ddba532393 100644 --- a/src/components/implementation/no_interface/omp_hello/init.c +++ b/src/components/implementation/no_interface/omp_hello/init.c @@ -17,6 +17,7 @@ cos_exit(int x) static void cos_main(void *d) { + assert(sl_thd_thdid(sl_thd_curr()) == cos_thdid()); main(); while (1) ; diff --git a/src/components/include/part.h b/src/components/include/part.h index 522d04abed..b3f01fbe75 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -157,12 +157,9 @@ part_pool_block(void) /* very much a replica of sl_thd_block + adding to thread pool in part */ sl_cs_enter(); - if (sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0)) { - sl_cs_exit(); - return; - } if (ps_list_singleton(t, partlist)) ps_list_head_append(part_thdpool_curr(), t, partlist); - sl_cs_exit_schedule(); + sl_cs_exit(); + sl_thd_block(0); assert(sl_thd_is_runnable(t)); #else sl_thd_yield(0); @@ -225,7 +222,7 @@ part_list_append(struct part_task *t) * wake up as many threads on this core! * some may not get work if other cores pull work before they get to it. */ - for (i = 0; i < t->nthds; i++) part_pool_wakeup(); + for (i = 1; i < t->nthds; i++) part_pool_wakeup(); /* if this is the first time in a parallel, make everyone know */ if (likely(!in_nest)) ps_faa(&in_main_parallel, 1); @@ -317,11 +314,7 @@ part_task_barrier(struct part_task *t, int is_end) assert(t->state == PART_TASK_S_INITIALIZED); assert(t->nthds >= 1); - /* master thread to wait for child threads first, before barrier! */ - if (is_master) { - assert(t->master == PART_CURR_THD); - part_task_wait_children(t); - } + part_task_wait_children(t); if (t->nthds == 1) { struct part_data *d; @@ -355,12 +348,8 @@ part_task_barrier(struct part_task *t, int is_end) sl_thd_block(0); } else { if (ps_cas(&t->barrier, 0, t->nthds)) ps_faa(&t->barrier_epoch, 1); - if (is_master) { - part_peer_wakeup(t); - } else { - part_master_wakeup(t); - //sl_thd_block(0); - } + if (is_master) part_peer_wakeup(t); + else part_master_wakeup(t); } assert(ps_load(&t->barrier_epoch) == cbep + 1); @@ -395,7 +384,7 @@ part_thd_fn(void *d) struct part_task *t = NULL; int ret; - while (!ps_load(&in_main_parallel)) part_pool_block(); + if (!ps_load(&in_main_parallel)) part_pool_block(); /* FIXME: nested parallel needs love! 
*/ t = part_list_peek(); diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index ceb757993c..352d0a52ee 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -13,8 +13,8 @@ #define PART_MAX_TASKS 256 #define PART_MAX_DATA 256 -#define PART_MAX_PAR_THDS NUM_CPU -#define PART_MAX_THDS 128 +#define PART_MAX_PAR_THDS NUM_CPU +#define PART_MAX_THDS 512 #define PART_MAX_CORE_THDS (PART_MAX_THDS/NUM_CPU) #define PART_MAX_CHILD 16 #define PART_MAX_WORKSHARES 16 @@ -88,7 +88,8 @@ struct part_task { struct part_data *data_env; struct part_task *parent; - int nchildren; + /* in data-parallel task, each thread waits for its children. */ + int nchildren[PART_MAX_PAR_THDS]; struct ps_list partask; struct part_task *next_free; /* for explicit task allocation/free */ @@ -121,7 +122,7 @@ part_task_init(struct part_task *t, part_task_type_t type, struct part_task *p, t->end = t->barrier_epoch = 0; t->data_env = d; t->parent = p; - t->nchildren = 0; + memset(t->nchildren, 0, sizeof(int) * PART_MAX_PAR_THDS); ps_list_init(t, partask); } @@ -131,66 +132,6 @@ void part_task_free(struct part_task *); struct part_data *part_data_alloc(void); void part_data_free(struct part_data *); -static inline int -part_task_add_child(struct part_task *t, struct part_task *c) -{ - int i; - - assert(t->state == PART_TASK_S_INITIALIZED); - - if (unlikely(!t || !c)) return -1; - - i = ps_faa(&t->nchildren, 1); - assert(i < PART_MAX_CHILD); - - return i; -} - -static inline void -part_thd_wakeup(unsigned thd) -{ - thdid_t t = PART_THD_THDID(thd); - cpuid_t c = PART_THD_COREID(thd); - - assert(c >= 0 && c < NUM_CPU); - assert(t < MAX_NUM_THREADS); - - if (thd == PART_CURR_THD) return; - if (c != cos_cpuid()) sl_xcore_thd_wakeup_tid(t, c); - else sl_thd_wakeup(t); -} - -static inline void -part_task_remove_child(struct part_task *c) -{ - struct part_task *p = c->parent; - unsigned wkup; - int i; - - if (unlikely(!p)) return; - assert(c->state == PART_TASK_S_INITIALIZED); - - if (c->type == PART_TASK_T_TASK) wkup = c->master; - else wkup = p->master; - - i = ps_faa(&p->nchildren, -1); - assert(i > 0); - - part_thd_wakeup(wkup); -} - -static inline void -part_task_wait_children(struct part_task *t) -{ - assert(t->state == PART_TASK_S_INITIALIZED); - if (t->type == PART_TASK_T_WORKSHARE) assert(t->master == PART_CURR_THD); - else if (t->type == PART_TASK_T_TASK) assert(t->workers[0] == PART_CURR_THD); - - while (ps_load(&t->nchildren) > 0) sl_thd_block(0); - - assert(t->nchildren == 0); -} - static inline int part_task_work_try(struct part_task *t) { @@ -218,10 +159,10 @@ part_task_work_try(struct part_task *t) } static inline int -part_task_work_thd_num(struct part_task *t) +part_task_work_thd_num(struct part_task *t, unsigned core_thd) { int i; - unsigned key = PART_CURR_THD; + unsigned key = core_thd; assert(t); @@ -243,6 +184,20 @@ part_task_work_thd_num(struct part_task *t) return -1; } +static inline void +part_thd_wakeup(unsigned thd) +{ + thdid_t t = PART_THD_THDID(thd); + cpuid_t c = PART_THD_COREID(thd); + + assert(c >= 0 && c < NUM_CPU); + assert(t < MAX_NUM_THREADS); + + if (thd == PART_CURR_THD) return; + if (c != cos_cpuid()) sl_xcore_thd_wakeup_tid(t, c); + else sl_thd_wakeup(t); +} + static inline void part_master_wakeup(struct part_task *t) { @@ -267,4 +222,56 @@ part_peer_wakeup(struct part_task *t) for (i = 1; i < t->nthds; i++) part_thd_wakeup(t->workers[i]); } +static inline int +part_task_add_child(struct part_task *t, struct part_task *c) +{ 
+ int i; + int num = part_task_work_thd_num(t, PART_CURR_THD); + + assert(num >= 0); + assert(t->state == PART_TASK_S_INITIALIZED); + + if (unlikely(!t || !c)) return -1; + + i = ps_faa(&t->nchildren[num], 1); + assert(i < PART_MAX_CHILD); + + return i; +} + +static inline void +part_task_remove_child(struct part_task *c) +{ + struct part_task *p = c->parent; + unsigned wkup; + int i, num; + + if (unlikely(!p)) return; + assert(c->state == PART_TASK_S_INITIALIZED); + + if (c->type == PART_TASK_T_TASK) wkup = c->master; + else wkup = p->master; + + num = part_task_work_thd_num(p, wkup); + assert(num >= 0); + + assert(p->nchildren[num] != 0); + i = ps_faa(&p->nchildren[num], -1); + assert(i > 0); + + /* only the last child to wake up the parent */ + if (i == 1) part_thd_wakeup(wkup); +} + +static inline void +part_task_wait_children(struct part_task *t) +{ + int num = part_task_work_thd_num(t, PART_CURR_THD); + + assert(num >= 0); + assert(t->state == PART_TASK_S_INITIALIZED); + + if (ps_load(&(t->nchildren[num])) > 0) sl_thd_block(0); +} + #endif /* PART_TASK_H */ diff --git a/src/components/include/sl.h b/src/components/include/sl.h index c628ee41b4..3ac908ac21 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -466,7 +466,7 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout) static inline int sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { - struct cos_scb_info *scb = sl_scb_info_core(); + volatile struct cos_scb_info *scb = sl_scb_info_core(); struct cos_dcb_info *cd = sl_thd_dcbinfo(curr), *nd = sl_thd_dcbinfo(next); assert(curr != next); @@ -523,9 +523,10 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); - if (likely(sl_scb_info_core()->sched_tok == tok)) return 0; + scb = sl_scb_info_core(); + if (unlikely(ps_load(&scb->sched_tok) != tok)) return -EAGAIN; - return -EAGAIN; + return 0; } static inline int @@ -614,7 +615,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) sl_thd_replenish_no_cs(t, now); #endif -// assert(t && sl_thd_is_runnable(t)); + assert(t && sl_thd_is_runnable(t)); #ifdef SL_CS sl_cs_exit(); #endif @@ -624,8 +625,36 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) * if the periodic timer is already ahead, * don't reprogram it! */ - if (likely(offset > globals->cyc_per_usec && globals->timer_prev)) ret = sl_thd_dispatch(t, tok, sl_thd_curr()); - else ret = sl_thd_activate(t, tok, globals->timeout_next); + if (likely(offset > globals->cyc_per_usec && globals->timer_prev)) { + ret = sl_thd_dispatch(t, tok, sl_thd_curr()); + } else { + ret = sl_thd_activate(t, tok, globals->timeout_next); + } + + /* + * one observation, in slowpath switch: + * if the kernel decides to switch over to scheduler thread and + * later at some point decides to resume this thread, the ret value + * from the syscall is probably 0, even though token has advanced and + * the switch this thread intended, did not go through. + * + * there is some wierd race in user-level thread switch: + * a thread sl_thd_block()'s itself and decides to switch to a runnable + * thread at user-level. + * if a preemption occurs and eventually this thread is resumed, + * for some reason the token check is not working well. + * + * what is more wierd is, even in slowpath sl_thd_activate(), I see that + * on return from syscall, this thread is not runnable. + * how is this possible? is there a race? i don't think so. 
+ * only the current thread can block itself, of course this is not true for AEPs. + * But for non AEPs, I don't know why this triggers! + * + * I'll need to rethink about some possible scenario, perhaps some bug in the code + * that returns to this thread when it is not runnable. + * something!!!! + */ + if (unlikely(!sl_thd_is_runnable(sl_thd_curr()))) return -EAGAIN; #ifdef SL_REPLENISH /* @@ -641,6 +670,8 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok, globals->timeout_next); } #endif + /* either this thread is runnable at this point or a switch failed */ + assert(sl_thd_is_runnable(sl_thd_curr()) || ret); return ret; } @@ -691,7 +722,7 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) sl_thd_replenish_no_cs(t, now); #endif -// assert(t && sl_thd_is_runnable(t)); + assert(t && sl_thd_is_runnable(t)); #ifdef SL_CS sl_cs_exit(); #endif @@ -711,6 +742,7 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) ret = sl_thd_activate(t, tok, abs_timeout < globals->timer_next ? tcap_cyc2time(abs_timeout) : globals->timeout_next); } + if (unlikely(!sl_thd_is_runnable(sl_thd_curr()))) return -EAGAIN; #ifdef SL_REPLENISH /* diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index 1b1590d0c8..f388db628f 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -107,7 +107,7 @@ GOMP_single_start(void) { struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; int i; - int coff = part_task_work_thd_num(t); + int coff = part_task_work_thd_num(t, PART_CURR_THD); unsigned b = 1 << coff; assert(coff >= 0 && coff < (int)t->nthds); @@ -183,7 +183,7 @@ GOMP_loop_dynamic_start (long start, long end, long incr, long chunk_size, { struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; int i; - int coff = part_task_work_thd_num(t); + int coff = part_task_work_thd_num(t, PART_CURR_THD); unsigned b = 1 << coff; assert(coff >= 0 && coff < (int)t->nthds); @@ -258,7 +258,7 @@ bool GOMP_loop_dynamic_next (long *istart, long *iend) { struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; - unsigned coff = part_task_work_thd_num(t); + unsigned coff = part_task_work_thd_num(t, PART_CURR_THD); int woff = t->ws_off[coff]; if (unlikely(woff < 0)) t->ws_off[coff] = woff = 0; @@ -271,7 +271,7 @@ void GOMP_loop_end (void) { struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; - unsigned coff = part_task_work_thd_num(t); + unsigned coff = part_task_work_thd_num(t, PART_CURR_THD); int woff = t->ws_off[coff], c = 0; assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); @@ -283,7 +283,7 @@ void GOMP_loop_end_nowait (void) { struct part_task *t = (struct part_task *)sl_thd_curr()->part_context; - unsigned coff = part_task_work_thd_num(t); + unsigned coff = part_task_work_thd_num(t, PART_CURR_THD); int woff = t->ws_off[coff], c = 0; assert(t->ws[woff].type == PART_WORKSHARE_LOOP_DYNAMIC); diff --git a/src/components/lib/cos_gomp/cos_omp.c b/src/components/lib/cos_gomp/cos_omp.c index c8b7309ae6..b74ea94785 100644 --- a/src/components/lib/cos_gomp/cos_omp.c +++ b/src/components/lib/cos_gomp/cos_omp.c @@ -63,7 +63,7 @@ omp_get_thread_num(void) if (!pt) return 0; - return part_task_work_thd_num(pt); + return part_task_work_thd_num(pt, PART_CURR_THD); } static inline void diff --git a/src/components/lib/part_capmgr.c 
b/src/components/lib/part_capmgr.c index fce8c5879f..660fe38afe 100644 --- a/src/components/lib/part_capmgr.c +++ b/src/components/lib/part_capmgr.c @@ -181,9 +181,11 @@ parttask_store_init_all(vaddr_t mem) static void part_idle_fn(void *d) { + struct sl_thd *sched = sl__globals_core()->sched_thd, *curr = sl_thd_curr(); + while (1) { - part_pool_wakeup(); - sl_thd_yield_thd(sl__globals_core()->sched_thd); + if (likely(ps_load(&in_main_parallel))) part_pool_wakeup(); + sl_thd_yield_thd(sched); } } diff --git a/src/components/lib/part_raw.c b/src/components/lib/part_raw.c index a499c7efec..a04f6ee2d1 100644 --- a/src/components/lib/part_raw.c +++ b/src/components/lib/part_raw.c @@ -183,9 +183,11 @@ parttask_store_init_all(vaddr_t mem) static void part_idle_fn(void *d) { + struct sl_thd *sched = sl__globals_core()->sched_thd, *curr = sl_thd_curr(); + while (1) { - part_pool_wakeup(); - sl_thd_yield_thd(sl__globals_core()->sched_thd); + if (likely(ps_load(&in_main_parallel))) part_pool_wakeup(); + sl_thd_yield_thd(sched); } } @@ -308,11 +310,11 @@ part_init(void) assert(part_dq_percore[k]); deque_init_part(part_dq_percore[k], PART_DEQUE_SZ); } - ptmem = cos_page_bump_allocn(ci, PART_TASKS_MAX_SZ * NUM_CPU); + ptmem = (vaddr_t)cos_page_bump_allocn(ci, PART_TASKS_MAX_SZ * NUM_CPU); assert(ptmem); memset((void *)ptmem, 0, PART_MAX_PAGES * PAGE_SIZE * NUM_CPU); - pdmem = cos_page_bump_allocn(ci, PART_DATA_MAX_SZ * NUM_CPU); + pdmem = (vaddr_t)cos_page_bump_allocn(ci, PART_DATA_MAX_SZ * NUM_CPU); assert(pdmem); memset((void *)pdmem, 0, PART_MAX_DATA_PAGES * PAGE_SIZE * NUM_CPU); @@ -345,6 +347,9 @@ part_init(void) } #ifdef PART_ENABLE_BLOCKING + it = sl_thd_alloc(part_idle_fn, NULL); + assert(it); + sl_thd_param_set(it, ip); sl_cs_enter(); /* * because it's fifo, all threads would go block @@ -353,9 +358,6 @@ part_init(void) * and on all other cores, scheduler would be running! */ sl_cs_exit_schedule(); - it = sl_thd_alloc(part_idle_fn, NULL); - assert(it); - sl_thd_param_set(it, ip); #endif ps_faa(&all_done, 1); diff --git a/src/components/lib/sl/sl_mod_part_fifo.c b/src/components/lib/sl/sl_mod_part_fifo.c index 1b9e5cb72e..3584d0dc26 100644 --- a/src/components/lib/sl/sl_mod_part_fifo.c +++ b/src/components/lib/sl/sl_mod_part_fifo.c @@ -24,7 +24,6 @@ sl_mod_execution(struct sl_thd_policy *t, cycles_t cycles) struct sl_thd_policy * sl_mod_schedule(void) { - struct sl_thd_policy *c = sl_mod_thd_policy_get(sl_thd_curr()); struct sl_thd_policy *t = NULL; if (unlikely(ps_list_head_empty(&threads[cos_cpuid()]))) goto done; diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 7f6329a2a6..5037b02c98 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -216,9 +216,9 @@ sl_thd_sched_unblock_no_cs(struct sl_thd *t) int sl_thd_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t timeout) { - assert(t); + assert(t && sl_thd_curr() == t); /* only current thread is allowed to block itself */ assert(t != sl__globals_core()->idle_thd && t != sl__globals_core()->sched_thd); - assert(sl_thd_curr() == t); /* only current thread is allowed to block itself */ + assert(sl_thd_is_runnable(t)); assert(block_type == SL_THD_BLOCKED_TIMEOUT || block_type == SL_THD_BLOCKED); if (t->schedthd) { @@ -414,23 +414,8 @@ sl_thd_wakeup_no_cs(struct sl_thd *t) return 0; } -// if (unlikely(sl_thd_is_runnable(t))) { -// /* t->state == SL_THD_WOKEN? multiple wakeups? 
*/ -// t->state = SL_THD_WOKEN; -// return 1; -// } - /* - * TODO: with blockpoints, multiple wakeup problem might go away. - * will try that next! - * - * For now, if a thread creates N tasks and if at least two of them - * complete before master goes to block, which can happen on multi-core - * execution of tasks, then that results in multiple wakeups! - */ - if (unlikely(t->state == SL_THD_WOKEN)) { - t->state = SL_THD_RUNNABLE; - return 1; - } else if (unlikely(t->state == SL_THD_RUNNABLE)) { + if (unlikely(sl_thd_is_runnable(t))) { + /* t->state == SL_THD_WOKEN? multiple wakeups? */ t->state = SL_THD_WOKEN; return 1; } @@ -680,6 +665,7 @@ sl_sched_loop_intern(int non_block) struct sl_global_core *g = sl__globals_core(); rcv_flags_t rfl = (non_block ? RCV_NON_BLOCKING : 0); + assert(sl_thd_curr() == g->sched_thd); assert(sl_core_active()); while (1) { diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 51a363453f..c280a4cfda 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -123,6 +123,7 @@ cap_ulthd_lazyupdate(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, if (unlikely(fixthd->dcbinfo && fixthd->dcbinfo->sp)) { regs->ip = fixthd->dcbinfo->ip + DCB_IP_KERN_OFF; regs->sp = fixthd->dcbinfo->sp; + regs->dx = 0; /* sched token is in edx! */ fixthd->dcbinfo->sp = 0; } diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index 0cd3cbc7f4..afb92c1edb 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -210,6 +210,7 @@ __captbl_cap2sz(cap_t c) switch (c) { case CAP_SRET: case CAP_TCAP: + case CAP_THD: return CAP_SZ_16B; case CAP_SCB: case CAP_DCB: @@ -217,7 +218,6 @@ __captbl_cap2sz(cap_t c) case CAP_PGTBL: case CAP_HW: /* TODO: 256bits = 32B * 8b */ return CAP_SZ_32B; - case CAP_THD: /* to allow thread migration across cores using the same capability */ case CAP_SINV: case CAP_COMP: case CAP_ASND: @@ -276,19 +276,21 @@ enum */ BOOT_CAPTBL_SELF_INITRCV_BASE = round_up_to_pow2(BOOT_CAPTBL_SELF_INITTHD_BASE + NUM_CPU * CAP64B_IDSZ, CAPMAX_ENTRY_SZ), - BOOT_CAPTBL_SELF_INITTCAP_BASE = round_up_to_pow2(BOOT_CAPTBL_SELF_INITRCV_BASE + NUM_CPU * CAP64B_IDSZ, - CAPMAX_ENTRY_SZ), - BOOT_CAPTBL_LAST_CAP = BOOT_CAPTBL_SELF_INITTCAP_BASE + NUM_CPU * CAP64B_IDSZ, + /* BOOT_CAPTBL_SELF_INITTCAP_BASE = round_up_to_pow2(BOOT_CAPTBL_SELF_INITRCV_BASE + NUM_CPU * CAP64B_IDSZ, + CAPMAX_ENTRY_SZ), */ + BOOT_CAPTBL_LAST_CAP = BOOT_CAPTBL_SELF_INITRCV_BASE + NUM_CPU * CAP64B_IDSZ, /* round up to next entry */ BOOT_CAPTBL_FREE = round_up_to_pow2(BOOT_CAPTBL_LAST_CAP, CAPMAX_ENTRY_SZ) }; +#define BOOT_CAPTBL_SELF_INITTCAP_BASE BOOT_CAPTBL_SELF_INITTHD_BASE + CAP16B_IDSZ + #define BOOT_CAPTBL_SELF_INITTHD_CPU_BASE (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE (BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITRCV_CPU_BASE (BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cos_cpuid())) #define BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE + cpuid * CAP64B_IDSZ) -#define BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTCAP_BASE + cpuid * CAP64B_IDSZ) +#define BOOT_CAPTBL_SELF_INITTCAP_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITTHD_BASE_CPU(cpuid) + CAP16B_IDSZ) #define BOOT_CAPTBL_SELF_INITRCV_BASE_CPU(cpuid) (BOOT_CAPTBL_SELF_INITRCV_BASE + cpuid * CAP64B_IDSZ) enum llboot_scb_dcb_caps @@ -299,7 +301,7 @@ enum llboot_scb_dcb_caps }; #define LLBOOT_CAPTBL_INITDCB_CPU(cpuid) (LLBOOT_CAPTBL_INITDCB + 
(CAP64B_IDSZ * cpuid)) -#define LLBOOT_CAPTBL_CPU_INITDCB (LLBOOT_CAPTBL_INITDCB_CPU(cos_cpuid())) +#define LLBOOT_CAPTBL_CPU_INITDCB LLBOOT_CAPTBL_INITDCB_CPU(cos_cpuid()) /* * The half of the first page of init captbl is devoted to root node. So, the From e1bf367af0a94939cff4781a53dd64733a68031b Mon Sep 17 00:00:00 2001 From: phani Date: Sun, 19 May 2019 23:08:00 -0400 Subject: [PATCH 098/127] Fixed end barrier for worker threads to not wait --- src/components/include/part.h | 16 ++++++++-------- src/components/include/part_task.h | 2 +- src/components/lib/part_capmgr.c | 3 +++ src/components/lib/part_raw.c | 9 ++++++--- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/components/include/part.h b/src/components/include/part.h index 773eaa4153..cdf97767bf 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -157,10 +157,6 @@ part_pool_block(void) /* very much a replica of sl_thd_block + adding to thread pool in part */ sl_cs_enter(); - if (sl_thd_block_no_cs(t, SL_THD_BLOCKED, 0)) { - sl_cs_exit(); - return; - } if (ps_list_singleton(t, partlist)) ps_list_head_append(part_thdpool_curr(), t, partlist); sl_cs_exit(); sl_thd_block(0); @@ -350,10 +346,16 @@ part_task_barrier(struct part_task *t, int is_end) cbc = ps_faa(&t->barrier, -1); if (cbc > 1) { sl_thd_block(0); + if (is_master) part_peer_wakeup(t); } else { if (ps_cas(&t->barrier, 0, t->nthds)) ps_faa(&t->barrier_epoch, 1); - if (is_master) part_peer_wakeup(t); - else part_master_wakeup(t); + if (is_master) { + part_peer_wakeup(t); + } + else { + part_master_wakeup(t); + sl_thd_block(0); + } } assert(ps_load(&t->barrier_epoch) == cbep + 1); @@ -361,12 +363,10 @@ part_task_barrier(struct part_task *t, int is_end) ps_faa(&t->end, 1); if (is_master) { - while (ps_load(&t->end) != t->nthds) sl_thd_block(0); part_task_remove_child(t); part_list_remove(t); ts->part_context = t->parent; } else { - part_master_wakeup(t); ts->part_context = NULL; } } diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index 352d0a52ee..ffe6ffa847 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -14,8 +14,8 @@ #define PART_MAX_TASKS 256 #define PART_MAX_DATA 256 #define PART_MAX_PAR_THDS NUM_CPU +#define PART_MAX_CORE_THDS 64 #define PART_MAX_THDS 512 -#define PART_MAX_CORE_THDS (PART_MAX_THDS/NUM_CPU) #define PART_MAX_CHILD 16 #define PART_MAX_WORKSHARES 16 diff --git a/src/components/lib/part_capmgr.c b/src/components/lib/part_capmgr.c index 660fe38afe..9d09024af4 100644 --- a/src/components/lib/part_capmgr.c +++ b/src/components/lib/part_capmgr.c @@ -184,6 +184,9 @@ part_idle_fn(void *d) struct sl_thd *sched = sl__globals_core()->sched_thd, *curr = sl_thd_curr(); while (1) { + /* + * TODO: threads could be woken up even if there is no work! + */ if (likely(ps_load(&in_main_parallel))) part_pool_wakeup(); sl_thd_yield_thd(sched); } diff --git a/src/components/lib/part_raw.c b/src/components/lib/part_raw.c index a04f6ee2d1..04c130d9b0 100644 --- a/src/components/lib/part_raw.c +++ b/src/components/lib/part_raw.c @@ -186,6 +186,9 @@ part_idle_fn(void *d) struct sl_thd *sched = sl__globals_core()->sched_thd, *curr = sl_thd_curr(); while (1) { + /* + * TODO: threads could be woken up even if there is no work! 
+ */ if (likely(ps_load(&in_main_parallel))) part_pool_wakeup(); sl_thd_yield_thd(sched); } @@ -347,9 +350,6 @@ part_init(void) } #ifdef PART_ENABLE_BLOCKING - it = sl_thd_alloc(part_idle_fn, NULL); - assert(it); - sl_thd_param_set(it, ip); sl_cs_enter(); /* * because it's fifo, all threads would go block @@ -358,6 +358,9 @@ part_init(void) * and on all other cores, scheduler would be running! */ sl_cs_exit_schedule(); + it = sl_thd_alloc(part_idle_fn, NULL); + assert(it); + sl_thd_param_set(it, ip); #endif ps_faa(&all_done, 1); From 4bf72b03a886dd9355002041dc6846403aca02aa Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 20 May 2019 23:57:09 -0400 Subject: [PATCH 099/127] Added more BOTS benchmarks - fft, sparselu (not working), strassen --- src/components/Makefile.comp | 1 + .../no_interface/omp_dijkstra/posix_basic.c | 2 +- .../no_interface/omp_fft_bots/Makefile | 19 + .../no_interface/omp_fft_bots/app-desc.h | 56 + .../no_interface/omp_fft_bots/bots.h | 1 + .../no_interface/omp_fft_bots/bots_common.c | 1 + .../no_interface/omp_fft_bots/bots_common.h | 1 + .../no_interface/omp_fft_bots/bots_main.c | 1 + .../no_interface/omp_fft_bots/bots_main.h | 1 + .../no_interface/omp_fft_bots/fft.c | 5015 +++++++++++++++++ .../no_interface/omp_fft_bots/fft.h | 55 + .../no_interface/omp_fft_bots/init.c | 1 + .../no_interface/omp_fft_bots/omp-tasks-app.h | 1 + .../no_interface/omp_fft_bots/posix_basic.c | 1 + .../no_interface/omp_fib_bots/bots.h | 2 +- .../no_interface/omp_fib_bots/bots_main.c | 7 +- .../omp_sparselu_for_bots/Makefile | 19 + .../omp_sparselu_for_bots/app-desc.h | 56 + .../no_interface/omp_sparselu_for_bots/bots.h | 1 + .../omp_sparselu_for_bots/bots_common.c | 1 + .../omp_sparselu_for_bots/bots_common.h | 1 + .../omp_sparselu_for_bots/bots_main.c | 1 + .../omp_sparselu_for_bots/bots_main.h | 1 + .../no_interface/omp_sparselu_for_bots/init.c | 1 + .../omp_sparselu_for_bots/omp-tasks-app.h | 1 + .../omp_sparselu_for_bots/posix_basic.c | 1 + .../omp_sparselu_for_bots/sparselu.c | 326 ++ .../omp_sparselu_for_bots/sparselu.h | 24 + .../omp_sparselu_single_bots/Makefile | 19 + .../omp_sparselu_single_bots/app-desc.h | 56 + .../omp_sparselu_single_bots/bots.h | 1 + .../omp_sparselu_single_bots/bots_common.c | 1 + .../omp_sparselu_single_bots/bots_common.h | 1 + .../omp_sparselu_single_bots/bots_main.c | 1 + .../omp_sparselu_single_bots/bots_main.h | 1 + .../omp_sparselu_single_bots/init.c | 1 + .../omp_sparselu_single_bots/omp-tasks-app.h | 1 + .../omp_sparselu_single_bots/posix_basic.c | 1 + .../omp_sparselu_single_bots/sparselu.c | 325 ++ .../omp_sparselu_single_bots/sparselu.h | 24 + .../no_interface/omp_strassen_bots/Makefile | 19 + .../no_interface/omp_strassen_bots/app-desc.h | 81 + .../no_interface/omp_strassen_bots/bots.h | 1 + .../omp_strassen_bots/bots_common.c | 1 + .../omp_strassen_bots/bots_common.h | 1 + .../omp_strassen_bots/bots_main.c | 1 + .../omp_strassen_bots/bots_main.h | 1 + .../no_interface/omp_strassen_bots/init.c | 1 + .../omp_strassen_bots/omp-tasks-app.h | 1 + .../omp_strassen_bots/posix_basic.c | 1 + .../no_interface/omp_strassen_bots/strassen.c | 1375 +++++ .../no_interface/omp_strassen_bots/strassen.h | 66 + src/components/include/part_task.h | 8 +- src/components/lib/Makefile | 2 +- src/components/lib/sl/sl_xcore.c | 10 +- src/platform/i386/runscripts/omp_fft_bots.sh | 10 + .../i386/runscripts/omp_sparselu_for_bots.sh | 10 + .../runscripts/omp_sparselu_single_bots.sh | 10 + .../i386/runscripts/omp_strassen_bots.sh | 10 + 59 files changed, 7625 insertions(+), 14 
deletions(-) create mode 100644 src/components/implementation/no_interface/omp_fft_bots/Makefile create mode 100644 src/components/implementation/no_interface/omp_fft_bots/app-desc.h create mode 120000 src/components/implementation/no_interface/omp_fft_bots/bots.h create mode 120000 src/components/implementation/no_interface/omp_fft_bots/bots_common.c create mode 120000 src/components/implementation/no_interface/omp_fft_bots/bots_common.h create mode 120000 src/components/implementation/no_interface/omp_fft_bots/bots_main.c create mode 120000 src/components/implementation/no_interface/omp_fft_bots/bots_main.h create mode 100644 src/components/implementation/no_interface/omp_fft_bots/fft.c create mode 100644 src/components/implementation/no_interface/omp_fft_bots/fft.h create mode 120000 src/components/implementation/no_interface/omp_fft_bots/init.c create mode 120000 src/components/implementation/no_interface/omp_fft_bots/omp-tasks-app.h create mode 120000 src/components/implementation/no_interface/omp_fft_bots/posix_basic.c create mode 100644 src/components/implementation/no_interface/omp_sparselu_for_bots/Makefile create mode 100644 src/components/implementation/no_interface/omp_sparselu_for_bots/app-desc.h create mode 120000 src/components/implementation/no_interface/omp_sparselu_for_bots/bots.h create mode 120000 src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.c create mode 120000 src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.h create mode 120000 src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.c create mode 120000 src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.h create mode 120000 src/components/implementation/no_interface/omp_sparselu_for_bots/init.c create mode 120000 src/components/implementation/no_interface/omp_sparselu_for_bots/omp-tasks-app.h create mode 120000 src/components/implementation/no_interface/omp_sparselu_for_bots/posix_basic.c create mode 100644 src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.c create mode 100644 src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.h create mode 100644 src/components/implementation/no_interface/omp_sparselu_single_bots/Makefile create mode 100644 src/components/implementation/no_interface/omp_sparselu_single_bots/app-desc.h create mode 120000 src/components/implementation/no_interface/omp_sparselu_single_bots/bots.h create mode 120000 src/components/implementation/no_interface/omp_sparselu_single_bots/bots_common.c create mode 120000 src/components/implementation/no_interface/omp_sparselu_single_bots/bots_common.h create mode 120000 src/components/implementation/no_interface/omp_sparselu_single_bots/bots_main.c create mode 120000 src/components/implementation/no_interface/omp_sparselu_single_bots/bots_main.h create mode 120000 src/components/implementation/no_interface/omp_sparselu_single_bots/init.c create mode 120000 src/components/implementation/no_interface/omp_sparselu_single_bots/omp-tasks-app.h create mode 120000 src/components/implementation/no_interface/omp_sparselu_single_bots/posix_basic.c create mode 100644 src/components/implementation/no_interface/omp_sparselu_single_bots/sparselu.c create mode 100644 src/components/implementation/no_interface/omp_sparselu_single_bots/sparselu.h create mode 100644 src/components/implementation/no_interface/omp_strassen_bots/Makefile create mode 100644 src/components/implementation/no_interface/omp_strassen_bots/app-desc.h 
create mode 120000 src/components/implementation/no_interface/omp_strassen_bots/bots.h create mode 120000 src/components/implementation/no_interface/omp_strassen_bots/bots_common.c create mode 120000 src/components/implementation/no_interface/omp_strassen_bots/bots_common.h create mode 120000 src/components/implementation/no_interface/omp_strassen_bots/bots_main.c create mode 120000 src/components/implementation/no_interface/omp_strassen_bots/bots_main.h create mode 120000 src/components/implementation/no_interface/omp_strassen_bots/init.c create mode 120000 src/components/implementation/no_interface/omp_strassen_bots/omp-tasks-app.h create mode 120000 src/components/implementation/no_interface/omp_strassen_bots/posix_basic.c create mode 100644 src/components/implementation/no_interface/omp_strassen_bots/strassen.c create mode 100644 src/components/implementation/no_interface/omp_strassen_bots/strassen.h create mode 100644 src/platform/i386/runscripts/omp_fft_bots.sh create mode 100644 src/platform/i386/runscripts/omp_sparselu_for_bots.sh create mode 100644 src/platform/i386/runscripts/omp_sparselu_single_bots.sh create mode 100644 src/platform/i386/runscripts/omp_strassen_bots.sh diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 2a4887534e..6afb70aaff 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -56,6 +56,7 @@ LDFLAGS+=-no-pie CFLAGS+=-fno-pie CXXFLAGS+=-fno-pie endif +CFLAGS+=-march=i386 -msse SERVER_STUB=s_stub.o CLIENT_STUB=c_stub.o diff --git a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c index c29ae23770..5a86408010 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c +++ b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c @@ -70,7 +70,7 @@ cos_syscall_handler(int syscall_num, long a, long b, long c, long d, long e, lon return (long)cos_mmap((void *)a, (size_t)b, (int)c, (int)d, (int)e, (off_t)f); } - if (syscall_num == __NR_brk) { + if (syscall_num == __NR_brk || syscall_num == __NR_munmap) { return 0; } diff --git a/src/components/implementation/no_interface/omp_fft_bots/Makefile b/src/components/implementation/no_interface/omp_fft_bots/Makefile new file mode 100644 index 0000000000..c5d7dddf99 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_fft_bots.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_fft_bots/app-desc.h b/src/components/implementation/no_interface/omp_fft_bots/app-desc.h new file mode 100644 index 0000000000..d31b29104e --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/app-desc.h @@ -0,0 +1,56 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion 
*/ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" +#include "fft.h" + +#define BOTS_APP_NAME "FFT" +#define BOTS_APP_PARAMETERS_DESC "Size=%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE 32*1024*1024 +#define BOTS_APP_DESC_ARG_SIZE "Matrix Size" + +#define BOTS_APP_INIT int i;\ + COMPLEX *in, *out1=NULL, *out2=NULL;\ + in = malloc(bots_arg_size * sizeof(COMPLEX));\ + +#define KERNEL_INIT\ + out1 = malloc(bots_arg_size * sizeof(COMPLEX));\ + for (i = 0; i < bots_arg_size; ++i) {\ + c_re(in[i]) = 1.0;\ + c_im(in[i]) = 1.0;\ + } +#define KERNEL_CALL fft(bots_arg_size, in, out1); +#define KERNEL_FINI + +#define KERNEL_SEQ_INIT\ + out2 = malloc(bots_arg_size * sizeof(COMPLEX));\ + for (i = 0; i < bots_arg_size; ++i) {\ + c_re(in[i]) = 1.0;\ + c_im(in[i]) = 1.0;\ + } +#define KERNEL_SEQ_CALL fft_seq(bots_arg_size, in, out2); +#define KERNEL_SEQ_FINI + +#undef BOTS_APP_CHECK_USES_SEQ_RESULT +#define KERNEL_CHECK test_correctness(bots_arg_size, out1, out2) + diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots.h b/src/components/implementation/no_interface/omp_fft_bots/bots.h new file mode 120000 index 0000000000..ea0ad2b59f --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots.h @@ -0,0 +1 @@ +../omp_fib_bots/bots.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_common.c b/src/components/implementation/no_interface/omp_fft_bots/bots_common.c new file mode 120000 index 0000000000..4802b0cf70 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_common.c @@ -0,0 +1 @@ +../omp_fib_bots/bots_common.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_common.h b/src/components/implementation/no_interface/omp_fft_bots/bots_common.h new file mode 120000 index 0000000000..14eda863e4 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_common.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_common.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_main.c b/src/components/implementation/no_interface/omp_fft_bots/bots_main.c new file mode 120000 index 0000000000..14f2dab009 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_main.c @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/bots_main.h b/src/components/implementation/no_interface/omp_fft_bots/bots_main.h new file mode 
120000 index 0000000000..86c06ad286 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/bots_main.h @@ -0,0 +1 @@ +../omp_fib_bots/bots_main.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/fft.c b/src/components/implementation/no_interface/omp_fft_bots/fft.c new file mode 100644 index 0000000000..b030676e26 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/fft.c @@ -0,0 +1,5015 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +/* + * Original code from the Cilk project + * + * Copyright (c) 2000 Massachusetts Institute of Technology + * Copyright (c) 2000 Matteo Frigo + */ + +#include +#include +#include +#include +#include "bots.h" +#include "app-desc.h" + +/* Definitions and operations for complex numbers */ + +/* + * compute the W coefficients (that is, powers of the root of 1) + * and store them into an array. + */ +void compute_w_coefficients(int n, int a, int b, COMPLEX * W) +{ + register double twoPiOverN; + register int k; + register REAL s, c; + + if (b - a < 128) { + twoPiOverN = 2.0 * 3.1415926535897932384626434 / n; + for (k = a; k <= b; ++k) { + c = cos(twoPiOverN * k); + c_re(W[k]) = c_re(W[n - k]) = c; + s = sin(twoPiOverN * k); + c_im(W[k]) = -s; + c_im(W[n - k]) = s; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + compute_w_coefficients(n, a, ab, W); + #pragma omp task + compute_w_coefficients(n, ab + 1, b, W); +#else + #pragma omp task untied + compute_w_coefficients(n, a, ab, W); + #pragma omp task untied + compute_w_coefficients(n, ab + 1, b, W); +#endif + #pragma omp taskwait + } +} +void compute_w_coefficients_seq(int n, int a, int b, COMPLEX * W) +{ + register double twoPiOverN; + register int k; + register REAL s, c; + + if (b - a < 128) { + twoPiOverN = 2.0 * 3.1415926535897932384626434 / n; + for (k = a; k <= b; ++k) { + c = cos(twoPiOverN * k); + c_re(W[k]) = c_re(W[n - k]) = c; + s = sin(twoPiOverN * k); + c_im(W[k]) = -s; + c_im(W[n - k]) = s; + } + } else { + int ab = (a + b) / 2; + compute_w_coefficients_seq(n, a, ab, W); + compute_w_coefficients_seq(n, ab + 1, b, W); + } +} +/* + * Determine (in a stupid way) if n is divisible by eight, then by four, else + * find the smallest prime factor of n. 
+ */ +int factor(int n) +{ + int r; + + if (n < 2) return 1; + if (n == 64 || n == 128 || n == 256 || n == 1024 || n == 2048 || n == 4096) return 8; + if ((n & 15) == 0) return 16; + if ((n & 7) == 0) return 8; + if ((n & 3) == 0) return 4; + if ((n & 1) == 0) return 2; + + /* try odd numbers up to n (computing the sqrt may be slower) */ + for (r = 3; r < n; r += 2) if (n % r == 0) return r; + + /* n is prime */ + return n; +} + +void unshuffle(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m) +{ + int i, j; + int r4 = r & (~0x3); + const COMPLEX *ip; + COMPLEX *jp; + + if (b - a < 16) { + ip = in + a * r; + for (i = a; i < b; ++i) { + jp = out + i; + for (j = 0; j < r4; j += 4) { + jp[0] = ip[0]; + jp[m] = ip[1]; + jp[2 * m] = ip[2]; + jp[3 * m] = ip[3]; + jp += 4 * m; + ip += 4; + } + for (; j < r; ++j) { + *jp = *ip; + ip++; + jp += m; + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + unshuffle(a, ab, in, out, r, m); + #pragma omp task + unshuffle(ab, b, in, out, r, m); +#else + #pragma omp task untied + unshuffle(a, ab, in, out, r, m); + #pragma omp task untied + unshuffle(ab, b, in, out, r, m); +#endif + #pragma omp taskwait + } +} +void unshuffle_seq(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m) +{ + int i, j; + int r4 = r & (~0x3); + const COMPLEX *ip; + COMPLEX *jp; + + if (b - a < 16) { + ip = in + a * r; + for (i = a; i < b; ++i) { + jp = out + i; + for (j = 0; j < r4; j += 4) { + jp[0] = ip[0]; + jp[m] = ip[1]; + jp[2 * m] = ip[2]; + jp[3 * m] = ip[3]; + jp += 4 * m; + ip += 4; + } + for (; j < r; ++j) { + *jp = *ip; + ip++; + jp += m; + } + } + } else { + int ab = (a + b) / 2; + unshuffle_seq(a, ab, in, out, r, m); + unshuffle_seq(ab, b, in, out, r, m); + } +} +void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out, + COMPLEX * W, int r, int m, + int nW, int nWdnti, int nWdntm) +{ + int j, k; + COMPLEX *jp, *kp; + + for (k = 0, kp = out; k < r; ++k, kp += m) { + REAL r0, i0, rt, it, rw, iw; + int l1 = nWdnti + nWdntm * k; + int l0; + + r0 = i0 = 0.0; + for (j = 0, jp = in, l0 = 0; j < r; ++j, jp += m) { + rw = c_re(W[l0]); + iw = c_im(W[l0]); + rt = c_re(*jp); + it = c_im(*jp); + r0 += rt * rw - it * iw; + i0 += rt * iw + it * rw; + l0 += l1; + if (l0 > nW) + l0 -= nW; + } + c_re(*kp) = r0; + c_im(*kp) = i0; + } +} + +void fft_twiddle_gen(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m) +{ +#if defined(FORCE_TIED_TASKS) + if (i == i1 - 1) { + #pragma omp task + fft_twiddle_gen1(in + i, out + i, W, + r, m, nW, nWdn * i, nWdn * m); + } else { + int i2 = (i + i1) / 2; + #pragma omp task + fft_twiddle_gen(i, i2, in, out, W, nW, + nWdn, r, m); + #pragma omp task + fft_twiddle_gen(i2, i1, in, out, W, nW, + nWdn, r, m); + } +#else + if (i == i1 - 1) { + #pragma omp task untied + fft_twiddle_gen1(in + i, out + i, W, + r, m, nW, nWdn * i, nWdn * m); + } else { + int i2 = (i + i1) / 2; + #pragma omp task untied + fft_twiddle_gen(i, i2, in, out, W, nW, + nWdn, r, m); + #pragma omp task untied + fft_twiddle_gen(i2, i1, in, out, W, nW, + nWdn, r, m); + } +#endif + #pragma omp taskwait +} +void fft_twiddle_gen_seq(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, + int nW, int nWdn, int r, int m) +{ + if (i == i1 - 1) { + fft_twiddle_gen1(in + i, out + i, W, + r, m, nW, nWdn * i, nWdn * m); + } else { + int i2 = (i + i1) / 2; + fft_twiddle_gen_seq(i, i2, in, out, W, nW, + nWdn, r, m); + fft_twiddle_gen_seq(i2, i1, in, out, W, nW, + nWdn, r, m); + } +} +/* machine-generated code begins 
here */ +void fft_base_2(COMPLEX * in, COMPLEX * out) +{ + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(in[0]); + i1_0 = c_im(in[0]); + r1_1 = c_re(in[1]); + i1_1 = c_im(in[1]); + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[1]) = (r1_0 - r1_1); + c_im(out[1]) = (i1_0 - i1_1); +} +void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(jp[0 * m]); + i1_0 = c_im(jp[0 * m]); + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r1_1 = ((wr * tmpr) - (wi * tmpi)); + i1_1 = ((wi * tmpr) + (wr * tmpi)); + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[1 * m]) = (r1_0 - r1_1); + c_im(kp[1 * m]) = (i1_0 - i1_1); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_2(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_2(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_2(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_2(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + r1_0 = c_re(jp[0 * m]); + i1_0 = c_im(jp[0 * m]); + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r1_1 = ((wr * tmpr) - (wi * tmpi)); + i1_1 = ((wi * tmpr) + (wr * tmpi)); + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[1 * m]) = (r1_0 - r1_1); + c_im(kp[1 * m]) = (i1_0 - i1_1); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_2_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_2_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 2; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_2(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_2(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_2(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_2(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 2; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_2_seq(a, ab, in, out, m); + fft_unshuffle_2_seq(ab, b, in, out, m); + } +} +void fft_base_4(COMPLEX * in, COMPLEX * out) +{ + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(in[0]); + i2_0 = c_im(in[0]); + r2_2 = c_re(in[2]); + i2_2 = c_im(in[2]); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + 
r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + r2_1 = c_re(in[1]); + i2_1 = c_im(in[1]); + r2_3 = c_re(in[3]); + i2_3 = c_im(in[3]); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[2]) = (r1_0 - r1_1); + c_im(out[2]) = (i1_0 - i1_1); + c_re(out[1]) = (r1_2 + i1_3); + c_im(out[1]) = (i1_2 - r1_3); + c_re(out[3]) = (r1_2 - i1_3); + c_im(out[3]) = (i1_2 + r1_3); +} +void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(jp[0 * m]); + i2_0 = c_im(jp[0 * m]); + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r2_2 = ((wr * tmpr) - (wi * tmpi)); + i2_2 = ((wi * tmpr) + (wr * tmpi)); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r2_1 = ((wr * tmpr) - (wi * tmpi)); + i2_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r2_3 = ((wr * tmpr) - (wi * tmpi)); + i2_3 = ((wi * tmpr) + (wr * tmpi)); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[2 * m]) = (r1_0 - r1_1); + c_im(kp[2 * m]) = (i1_0 - i1_1); + c_re(kp[1 * m]) = (r1_2 + i1_3); + c_im(kp[1 * m]) = (i1_2 - r1_3); + c_re(kp[3 * m]) = (r1_2 - i1_3); + c_im(kp[3 * m]) = (i1_2 + r1_3); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_4(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_4(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_4(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_4(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + r2_0 = c_re(jp[0 * m]); + i2_0 = c_im(jp[0 * m]); + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r2_2 = ((wr * tmpr) - (wi * tmpi)); + i2_2 = ((wi * tmpr) + (wr * tmpi)); + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_2 = (r2_0 - r2_2); + i1_2 = (i2_0 - i2_2); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r2_1 = ((wr * tmpr) - (wi * tmpi)); + i2_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r2_3 = ((wr * tmpr) - (wi * tmpi)); + i2_3 = ((wi * 
tmpr) + (wr * tmpi)); + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_3 = (r2_1 - r2_3); + i1_3 = (i2_1 - i2_3); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[2 * m]) = (r1_0 - r1_1); + c_im(kp[2 * m]) = (i1_0 - i1_1); + c_re(kp[1 * m]) = (r1_2 + i1_3); + c_im(kp[1 * m]) = (i1_2 - r1_3); + c_re(kp[3 * m]) = (r1_2 - i1_3); + c_im(kp[3 * m]) = (i1_2 + r1_3); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_4_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_4_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 4; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_4(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_4(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_4(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_4(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 4; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_4_seq(a, ab, in, out, m); + fft_unshuffle_4_seq(ab, b, in, out, m); + } +} +void fft_base_8(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(in[0]); + i3_0 = c_im(in[0]); + r3_4 = c_re(in[4]); + i3_4 = c_im(in[4]); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + r3_2 = c_re(in[2]); + i3_2 = c_im(in[2]); + r3_6 = c_re(in[6]); + i3_6 = c_im(in[6]); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + r3_1 = c_re(in[1]); + i3_1 = c_im(in[1]); + r3_5 = c_re(in[5]); + i3_5 = c_im(in[5]); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + r3_3 = c_re(in[3]); + i3_3 = c_im(in[3]); + r3_7 = c_re(in[7]); + i3_7 = c_im(in[7]); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[4]) = (r1_0 - r1_1); + c_im(out[4]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = 
(0.707106781187 * (i1_3 - r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[5]) = (r1_2 - tmpr); + c_im(out[5]) = (i1_2 - tmpi); + c_re(out[2]) = (r1_4 + i1_5); + c_im(out[2]) = (i1_4 - r1_5); + c_re(out[6]) = (r1_4 - i1_5); + c_im(out[6]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 - tmpi); + c_re(out[7]) = (r1_6 - tmpr); + c_im(out[7]) = (i1_6 + tmpi); + } +} +void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(jp[0 * m]); + i3_0 = c_im(jp[0 * m]); + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r3_4 = ((wr * tmpr) - (wi * tmpi)); + i3_4 = ((wi * tmpr) + (wr * tmpi)); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r3_2 = ((wr * tmpr) - (wi * tmpi)); + i3_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r3_6 = ((wr * tmpr) - (wi * tmpi)); + i3_6 = ((wi * tmpr) + (wr * tmpi)); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r3_1 = ((wr * tmpr) - (wi * tmpi)); + i3_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r3_5 = ((wr * tmpr) - (wi * tmpi)); + i3_5 = ((wi * tmpr) + (wr * tmpi)); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r3_3 = ((wr * tmpr) - (wi * tmpi)); + i3_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r3_7 = ((wr * tmpr) - (wi * tmpi)); + i3_7 = ((wi * tmpr) + (wr * tmpi)); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 - r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[4 * m]) = (r1_0 - r1_1); + c_im(kp[4 * m]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = 
(0.707106781187 * (i1_3 - r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[5 * m]) = (r1_2 - tmpr); + c_im(kp[5 * m]) = (i1_2 - tmpi); + c_re(kp[2 * m]) = (r1_4 + i1_5); + c_im(kp[2 * m]) = (i1_4 - r1_5); + c_re(kp[6 * m]) = (r1_4 - i1_5); + c_im(kp[6 * m]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 - tmpi); + c_re(kp[7 * m]) = (r1_6 - tmpr); + c_im(kp[7 * m]) = (i1_6 + tmpi); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_8(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_8(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_8(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_8(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + r3_0 = c_re(jp[0 * m]); + i3_0 = c_im(jp[0 * m]); + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r3_4 = ((wr * tmpr) - (wi * tmpi)); + i3_4 = ((wi * tmpr) + (wr * tmpi)); + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_4 = (r3_0 - r3_4); + i2_4 = (i3_0 - i3_4); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r3_2 = ((wr * tmpr) - (wi * tmpi)); + i3_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r3_6 = ((wr * tmpr) - (wi * tmpi)); + i3_6 = ((wi * tmpr) + (wr * tmpi)); + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_6 = (r3_2 - r3_6); + i2_6 = (i3_2 - i3_6); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_4 = (r2_0 - r2_2); + i1_4 = (i2_0 - i2_2); + r1_2 = (r2_4 + i2_6); + i1_2 = (i2_4 - r2_6); + r1_6 = (r2_4 - i2_6); + i1_6 = (i2_4 + r2_6); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r3_1 = ((wr * tmpr) - (wi * tmpi)); + i3_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r3_5 = ((wr * tmpr) - (wi * tmpi)); + i3_5 = ((wi * tmpr) + (wr * tmpi)); + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_5 = (r3_1 - r3_5); + i2_5 = (i3_1 - i3_5); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r3_3 = ((wr * tmpr) - (wi * tmpi)); + i3_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r3_7 = ((wr * tmpr) - (wi * tmpi)); + i3_7 = ((wi * tmpr) + (wr * tmpi)); + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_7 = (r3_3 
- r3_7); + i2_7 = (i3_3 - i3_7); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_5 = (r2_1 - r2_3); + i1_5 = (i2_1 - i2_3); + r1_3 = (r2_5 + i2_7); + i1_3 = (i2_5 - r2_7); + r1_7 = (r2_5 - i2_7); + i1_7 = (i2_5 + r2_7); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[4 * m]) = (r1_0 - r1_1); + c_im(kp[4 * m]) = (i1_0 - i1_1); + tmpr = (0.707106781187 * (r1_3 + i1_3)); + tmpi = (0.707106781187 * (i1_3 - r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[5 * m]) = (r1_2 - tmpr); + c_im(kp[5 * m]) = (i1_2 - tmpi); + c_re(kp[2 * m]) = (r1_4 + i1_5); + c_im(kp[2 * m]) = (i1_4 - r1_5); + c_re(kp[6 * m]) = (r1_4 - i1_5); + c_im(kp[6 * m]) = (i1_4 + r1_5); + tmpr = (0.707106781187 * (i1_7 - r1_7)); + tmpi = (0.707106781187 * (r1_7 + i1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 - tmpi); + c_re(kp[7 * m]) = (r1_6 - tmpr); + c_im(kp[7 * m]) = (i1_6 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_8_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_8_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 8; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_8(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_8(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_8(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_8(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 8; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_8_seq(a, ab, in, out, m); + fft_unshuffle_8_seq(ab, b, in, out, m); + } +} +void fft_base_16(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(in[0]); + i4_0 = c_im(in[0]); + r4_8 = c_re(in[8]); + i4_8 = c_im(in[8]); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + r4_4 = c_re(in[4]); + i4_4 = c_im(in[4]); + r4_12 = c_re(in[12]); + i4_12 = c_im(in[12]); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + 
i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + r4_2 = c_re(in[2]); + i4_2 = c_im(in[2]); + r4_10 = c_re(in[10]); + i4_10 = c_im(in[10]); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + r4_6 = c_re(in[6]); + i4_6 = c_im(in[6]); + r4_14 = c_re(in[14]); + i4_14 = c_im(in[14]); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + r4_1 = c_re(in[1]); + i4_1 = c_im(in[1]); + r4_9 = c_re(in[9]); + i4_9 = c_im(in[9]); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + r4_5 = c_re(in[5]); + i4_5 = c_im(in[5]); + r4_13 = c_re(in[13]); + i4_13 = c_im(in[13]); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + r4_3 = c_re(in[3]); + i4_3 = c_im(in[3]); + r4_11 = c_re(in[11]); + i4_11 = c_im(in[11]); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + r4_7 = c_re(in[7]); + i4_7 = c_im(in[7]); + r4_15 = c_re(in[15]); + i4_15 = c_im(in[15]); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - 
tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[8]) = (r1_0 - r1_1); + c_im(out[8]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[9]) = (r1_2 - tmpr); + c_im(out[9]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(out[2]) = (r1_4 + tmpr); + c_im(out[2]) = (i1_4 + tmpi); + c_re(out[10]) = (r1_4 - tmpr); + c_im(out[10]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 + tmpi); + c_re(out[11]) = (r1_6 - tmpr); + c_im(out[11]) = (i1_6 - tmpi); + c_re(out[4]) = (r1_8 + i1_9); + c_im(out[4]) = (i1_8 - r1_9); + c_re(out[12]) = (r1_8 - i1_9); + c_im(out[12]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(out[5]) = (r1_10 + tmpr); + c_im(out[5]) = (i1_10 - tmpi); + c_re(out[13]) = (r1_10 - tmpr); + c_im(out[13]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(out[6]) = (r1_12 + tmpr); + c_im(out[6]) = (i1_12 - tmpi); + c_re(out[14]) = (r1_12 - tmpr); + c_im(out[14]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(out[7]) = (r1_14 + tmpr); + c_im(out[7]) = (i1_14 - tmpi); + c_re(out[15]) = (r1_14 - tmpr); + c_im(out[15]) = (i1_14 + tmpi); + } +} +void fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(jp[0 * m]); + i4_0 = c_im(jp[0 * m]); + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r4_8 = ((wr * tmpr) - (wi * tmpi)); + i4_8 = ((wi * tmpr) + (wr * tmpi)); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r4_4 = ((wr * tmpr) - (wi * tmpi)); + i4_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + 
tmpi = c_im(jp[12 * m]); + r4_12 = ((wr * tmpr) - (wi * tmpi)); + i4_12 = ((wi * tmpr) + (wr * tmpi)); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r4_2 = ((wr * tmpr) - (wi * tmpi)); + i4_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r4_10 = ((wr * tmpr) - (wi * tmpi)); + i4_10 = ((wi * tmpr) + (wr * tmpi)); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r4_6 = ((wr * tmpr) - (wi * tmpi)); + i4_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r4_14 = ((wr * tmpr) - (wi * tmpi)); + i4_14 = ((wi * tmpr) + (wr * tmpi)); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r4_1 = ((wr * tmpr) - (wi * tmpi)); + i4_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r4_9 = ((wr * tmpr) - (wi * tmpi)); + i4_9 = ((wi * tmpr) + (wr * tmpi)); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r4_5 = ((wr * tmpr) - (wi * tmpi)); + i4_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r4_13 = ((wr * tmpr) - (wi * tmpi)); + i4_13 = ((wi * tmpr) + (wr * tmpi)); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = 
(r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r4_3 = ((wr * tmpr) - (wi * tmpi)); + i4_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r4_11 = ((wr * tmpr) - (wi * tmpi)); + i4_11 = ((wi * tmpr) + (wr * tmpi)); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r4_7 = ((wr * tmpr) - (wi * tmpi)); + i4_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r4_15 = ((wr * tmpr) - (wi * tmpi)); + i4_15 = ((wi * tmpr) + (wr * tmpi)); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[8 * m]) = (r1_0 - r1_1); + c_im(kp[8 * m]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[9 * m]) = (r1_2 - tmpr); + c_im(kp[9 * m]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[10 * m]) = (r1_4 - tmpr); + c_im(kp[10 * m]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[11 * m]) = (r1_6 - tmpr); + c_im(kp[11 * m]) = (i1_6 - tmpi); + c_re(kp[4 * m]) = (r1_8 + i1_9); + c_im(kp[4 * m]) = (i1_8 - r1_9); + c_re(kp[12 * m]) = (r1_8 - i1_9); + c_im(kp[12 * m]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 - tmpi); + c_re(kp[13 * m]) = (r1_10 - tmpr); + c_im(kp[13 * m]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 - tmpi); 
+ c_re(kp[14 * m]) = (r1_12 - tmpr); + c_im(kp[14 * m]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 - tmpi); + c_re(kp[15 * m]) = (r1_14 - tmpr); + c_im(kp[15 * m]) = (i1_14 + tmpi); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_16(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_16(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_16(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_16(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + r4_0 = c_re(jp[0 * m]); + i4_0 = c_im(jp[0 * m]); + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r4_8 = ((wr * tmpr) - (wi * tmpi)); + i4_8 = ((wi * tmpr) + (wr * tmpi)); + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_8 = (r4_0 - r4_8); + i3_8 = (i4_0 - i4_8); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r4_4 = ((wr * tmpr) - (wi * tmpi)); + i4_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r4_12 = ((wr * tmpr) - (wi * tmpi)); + i4_12 = ((wi * tmpr) + (wr * tmpi)); + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_12 = (r4_4 - r4_12); + i3_12 = (i4_4 - i4_12); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_8 = (r3_0 - r3_4); + i2_8 = (i3_0 - i3_4); + r2_4 = (r3_8 + i3_12); + i2_4 = (i3_8 - r3_12); + r2_12 = (r3_8 - i3_12); + i2_12 = (i3_8 + r3_12); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r4_2 = ((wr * tmpr) - (wi * tmpi)); + i4_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r4_10 = ((wr * tmpr) - (wi * tmpi)); + i4_10 = ((wi * tmpr) + (wr * tmpi)); + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_10 = (r4_2 - r4_10); + i3_10 = (i4_2 - i4_10); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r4_6 = ((wr * tmpr) - (wi * tmpi)); + i4_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr 
= c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r4_14 = ((wr * tmpr) - (wi * tmpi)); + i4_14 = ((wi * tmpr) + (wr * tmpi)); + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_14 = (r4_6 - r4_14); + i3_14 = (i4_6 - i4_14); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_10 = (r3_2 - r3_6); + i2_10 = (i3_2 - i3_6); + r2_6 = (r3_10 + i3_14); + i2_6 = (i3_10 - r3_14); + r2_14 = (r3_10 - i3_14); + i2_14 = (i3_10 + r3_14); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_8 = (r2_0 - r2_2); + i1_8 = (i2_0 - i2_2); + tmpr = (0.707106781187 * (r2_6 + i2_6)); + tmpi = (0.707106781187 * (i2_6 - r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_10 = (r2_4 - tmpr); + i1_10 = (i2_4 - tmpi); + r1_4 = (r2_8 + i2_10); + i1_4 = (i2_8 - r2_10); + r1_12 = (r2_8 - i2_10); + i1_12 = (i2_8 + r2_10); + tmpr = (0.707106781187 * (i2_14 - r2_14)); + tmpi = (0.707106781187 * (r2_14 + i2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 - tmpi); + r1_14 = (r2_12 - tmpr); + i1_14 = (i2_12 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r4_1 = ((wr * tmpr) - (wi * tmpi)); + i4_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r4_9 = ((wr * tmpr) - (wi * tmpi)); + i4_9 = ((wi * tmpr) + (wr * tmpi)); + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_9 = (r4_1 - r4_9); + i3_9 = (i4_1 - i4_9); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r4_5 = ((wr * tmpr) - (wi * tmpi)); + i4_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r4_13 = ((wr * tmpr) - (wi * tmpi)); + i4_13 = ((wi * tmpr) + (wr * tmpi)); + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_13 = (r4_5 - r4_13); + i3_13 = (i4_5 - i4_13); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_9 = (r3_1 - r3_5); + i2_9 = (i3_1 - i3_5); + r2_5 = (r3_9 + i3_13); + i2_5 = (i3_9 - r3_13); + r2_13 = (r3_9 - i3_13); + i2_13 = (i3_9 + r3_13); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r4_3 = ((wr * tmpr) - (wi * tmpi)); + i4_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r4_11 = ((wr * tmpr) - (wi * tmpi)); + i4_11 = ((wi * tmpr) + (wr * tmpi)); + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_11 = (r4_3 - r4_11); + i3_11 = (i4_3 - i4_11); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r4_7 = ((wr * tmpr) - (wi * tmpi)); + i4_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r4_15 = ((wr * tmpr) - (wi * tmpi)); + i4_15 = ((wi * tmpr) + (wr * tmpi)); + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_15 = (r4_7 - r4_15); + i3_15 = (i4_7 - 
i4_15); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_11 = (r3_3 - r3_7); + i2_11 = (i3_3 - i3_7); + r2_7 = (r3_11 + i3_15); + i2_7 = (i3_11 - r3_15); + r2_15 = (r3_11 - i3_15); + i2_15 = (i3_11 + r3_15); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_9 = (r2_1 - r2_3); + i1_9 = (i2_1 - i2_3); + tmpr = (0.707106781187 * (r2_7 + i2_7)); + tmpi = (0.707106781187 * (i2_7 - r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_11 = (r2_5 - tmpr); + i1_11 = (i2_5 - tmpi); + r1_5 = (r2_9 + i2_11); + i1_5 = (i2_9 - r2_11); + r1_13 = (r2_9 - i2_11); + i1_13 = (i2_9 + r2_11); + tmpr = (0.707106781187 * (i2_15 - r2_15)); + tmpi = (0.707106781187 * (r2_15 + i2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 - tmpi); + r1_15 = (r2_13 - tmpr); + i1_15 = (i2_13 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[8 * m]) = (r1_0 - r1_1); + c_im(kp[8 * m]) = (i1_0 - i1_1); + tmpr = ((0.923879532511 * r1_3) + (0.382683432365 * i1_3)); + tmpi = ((0.923879532511 * i1_3) - (0.382683432365 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[9 * m]) = (r1_2 - tmpr); + c_im(kp[9 * m]) = (i1_2 - tmpi); + tmpr = (0.707106781187 * (r1_5 + i1_5)); + tmpi = (0.707106781187 * (i1_5 - r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[10 * m]) = (r1_4 - tmpr); + c_im(kp[10 * m]) = (i1_4 - tmpi); + tmpr = ((0.382683432365 * r1_7) + (0.923879532511 * i1_7)); + tmpi = ((0.382683432365 * i1_7) - (0.923879532511 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[11 * m]) = (r1_6 - tmpr); + c_im(kp[11 * m]) = (i1_6 - tmpi); + c_re(kp[4 * m]) = (r1_8 + i1_9); + c_im(kp[4 * m]) = (i1_8 - r1_9); + c_re(kp[12 * m]) = (r1_8 - i1_9); + c_im(kp[12 * m]) = (i1_8 + r1_9); + tmpr = ((0.923879532511 * i1_11) - (0.382683432365 * r1_11)); + tmpi = ((0.923879532511 * r1_11) + (0.382683432365 * i1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 - tmpi); + c_re(kp[13 * m]) = (r1_10 - tmpr); + c_im(kp[13 * m]) = (i1_10 + tmpi); + tmpr = (0.707106781187 * (i1_13 - r1_13)); + tmpi = (0.707106781187 * (r1_13 + i1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 - tmpi); + c_re(kp[14 * m]) = (r1_12 - tmpr); + c_im(kp[14 * m]) = (i1_12 + tmpi); + tmpr = ((0.382683432365 * i1_15) - (0.923879532511 * r1_15)); + tmpi = ((0.382683432365 * r1_15) + (0.923879532511 * i1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 - tmpi); + c_re(kp[15 * m]) = (r1_14 - tmpr); + c_im(kp[15 * m]) = (i1_14 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_16_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_16_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 16; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task untied + 
fft_unshuffle_16(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_16(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_16(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_16(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 16; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_16_seq(a, ab, in, out, m); + fft_unshuffle_16_seq(ab, b, in, out, m); + } +} +void fft_base_32(COMPLEX * in, COMPLEX * out) +{ + REAL tmpr, tmpi; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(in[0]); + i5_0 = c_im(in[0]); + r5_16 = c_re(in[16]); + i5_16 = c_im(in[16]); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + r5_8 = c_re(in[8]); + i5_8 = c_im(in[8]); + r5_24 = c_re(in[24]); + i5_24 = c_im(in[24]); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + r5_4 = c_re(in[4]); + i5_4 = c_im(in[4]); + r5_20 = c_re(in[20]); + i5_20 = c_im(in[20]); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + r5_12 = c_re(in[12]); + i5_12 = c_im(in[12]); + r5_28 = c_re(in[28]); + i5_28 = c_im(in[28]); + 
r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + r5_2 = c_re(in[2]); + i5_2 = c_im(in[2]); + r5_18 = c_re(in[18]); + i5_18 = c_im(in[18]); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + r5_10 = c_re(in[10]); + i5_10 = c_im(in[10]); + r5_26 = c_re(in[26]); + i5_26 = c_im(in[26]); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + r5_6 = c_re(in[6]); + i5_6 = c_im(in[6]); + r5_22 = c_re(in[22]); + i5_22 = c_im(in[22]); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + r5_14 = c_re(in[14]); + i5_14 = c_im(in[14]); + r5_30 = c_re(in[30]); + i5_30 = c_im(in[30]); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = 
(r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + r5_1 = c_re(in[1]); + i5_1 = c_im(in[1]); + r5_17 = c_re(in[17]); + i5_17 = c_im(in[17]); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + r5_9 = c_re(in[9]); + i5_9 = c_im(in[9]); + r5_25 = c_re(in[25]); + i5_25 = c_im(in[25]); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + r5_5 = c_re(in[5]); + i5_5 = c_im(in[5]); + r5_21 = c_re(in[21]); + i5_21 = c_im(in[21]); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + r5_13 = c_re(in[13]); + i5_13 = c_im(in[13]); + r5_29 = c_re(in[29]); + i5_29 = c_im(in[29]); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); 
+ i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + r5_3 = c_re(in[3]); + i5_3 = c_im(in[3]); + r5_19 = c_re(in[19]); + i5_19 = c_im(in[19]); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + r5_11 = c_re(in[11]); + i5_11 = c_im(in[11]); + r5_27 = c_re(in[27]); + i5_27 = c_im(in[27]); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + r5_7 = c_re(in[7]); + i5_7 = c_im(in[7]); + r5_23 = c_re(in[23]); + i5_23 = c_im(in[23]); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + r5_15 = c_re(in[15]); + i5_15 = c_im(in[15]); + r5_31 = c_re(in[31]); + i5_31 = c_im(in[31]); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 
* i2_23) - (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(out[0]) = (r1_0 + r1_1); + c_im(out[0]) = (i1_0 + i1_1); + c_re(out[16]) = (r1_0 - r1_1); + c_im(out[16]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(out[1]) = (r1_2 + tmpr); + c_im(out[1]) = (i1_2 + tmpi); + c_re(out[17]) = (r1_2 - tmpr); + c_im(out[17]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(out[2]) = (r1_4 + tmpr); + c_im(out[2]) = (i1_4 + tmpi); + c_re(out[18]) = (r1_4 - tmpr); + c_im(out[18]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(out[3]) = (r1_6 + tmpr); + c_im(out[3]) = (i1_6 + tmpi); + c_re(out[19]) = (r1_6 - tmpr); + c_im(out[19]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(out[4]) = (r1_8 + tmpr); + c_im(out[4]) = (i1_8 + tmpi); + c_re(out[20]) = (r1_8 - tmpr); + c_im(out[20]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(out[5]) = (r1_10 + tmpr); + c_im(out[5]) = (i1_10 + tmpi); + c_re(out[21]) = (r1_10 - tmpr); + c_im(out[21]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(out[6]) = (r1_12 + tmpr); + c_im(out[6]) = (i1_12 + tmpi); + c_re(out[22]) = (r1_12 - tmpr); + c_im(out[22]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(out[7]) = (r1_14 + tmpr); + c_im(out[7]) = (i1_14 + tmpi); + c_re(out[23]) = (r1_14 - tmpr); + c_im(out[23]) = (i1_14 - tmpi); + c_re(out[8]) = (r1_16 + i1_17); + c_im(out[8]) = (i1_16 - r1_17); + c_re(out[24]) = (r1_16 - i1_17); + c_im(out[24]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(out[9]) = (r1_18 + tmpr); + c_im(out[9]) = (i1_18 - tmpi); + c_re(out[25]) = (r1_18 - tmpr); + c_im(out[25]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(out[10]) = (r1_20 + tmpr); + c_im(out[10]) = (i1_20 - tmpi); + c_re(out[26]) = (r1_20 - tmpr); + c_im(out[26]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(out[11]) = (r1_22 + tmpr); + c_im(out[11]) = (i1_22 - tmpi); + c_re(out[27]) = (r1_22 - tmpr); + c_im(out[27]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + 
c_re(out[12]) = (r1_24 + tmpr); + c_im(out[12]) = (i1_24 - tmpi); + c_re(out[28]) = (r1_24 - tmpr); + c_im(out[28]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(out[13]) = (r1_26 + tmpr); + c_im(out[13]) = (i1_26 - tmpi); + c_re(out[29]) = (r1_26 - tmpr); + c_im(out[29]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(out[14]) = (r1_28 + tmpr); + c_im(out[14]) = (i1_28 - tmpi); + c_re(out[30]) = (r1_28 - tmpr); + c_im(out[30]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(out[15]) = (r1_30 + tmpr); + c_im(out[15]) = (i1_30 - tmpi); + c_re(out[31]) = (r1_30 - tmpr); + c_im(out[31]) = (i1_30 + tmpi); + } +} +void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(jp[0 * m]); + i5_0 = c_im(jp[0 * m]); + wr = c_re(W[16 * l1]); + wi = c_im(W[16 * l1]); + tmpr = c_re(jp[16 * m]); + tmpi = c_im(jp[16 * m]); + r5_16 = ((wr * tmpr) - (wi * tmpi)); + i5_16 = ((wi * tmpr) + (wr * tmpi)); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r5_8 = ((wr * tmpr) - (wi * tmpi)); + i5_8 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[24 * l1]); + wi = c_im(W[24 * l1]); + tmpr = c_re(jp[24 * m]); + tmpi = c_im(jp[24 * m]); + r5_24 = ((wr * tmpr) - (wi * tmpi)); + i5_24 = ((wi * tmpr) + (wr * tmpi)); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + 
REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r5_4 = ((wr * tmpr) - (wi * tmpi)); + i5_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[20 * l1]); + wi = c_im(W[20 * l1]); + tmpr = c_re(jp[20 * m]); + tmpi = c_im(jp[20 * m]); + r5_20 = ((wr * tmpr) - (wi * tmpi)); + i5_20 = ((wi * tmpr) + (wr * tmpi)); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r5_12 = ((wr * tmpr) - (wi * tmpi)); + i5_12 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[28 * l1]); + wi = c_im(W[28 * l1]); + tmpr = c_re(jp[28 * m]); + tmpi = c_im(jp[28 * m]); + r5_28 = ((wr * tmpr) - (wi * tmpi)); + i5_28 = ((wi * tmpr) + (wr * tmpi)); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + r5_2 = ((wr * tmpr) - (wi * tmpi)); + i5_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[18 * l1]); + wi = c_im(W[18 * l1]); + tmpr = c_re(jp[18 * m]); + tmpi = c_im(jp[18 * m]); + r5_18 = ((wr * tmpr) - (wi * tmpi)); + i5_18 = ((wi * tmpr) + (wr * tmpi)); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r5_10 = ((wr * tmpr) - (wi * tmpi)); + i5_10 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[26 * l1]); + wi = c_im(W[26 * l1]); + tmpr = c_re(jp[26 * m]); + tmpi = c_im(jp[26 * m]); + r5_26 = ((wr * tmpr) - (wi * tmpi)); + i5_26 = ((wi * tmpr) + (wr * tmpi)); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + wr = 
c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r5_6 = ((wr * tmpr) - (wi * tmpi)); + i5_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[22 * l1]); + wi = c_im(W[22 * l1]); + tmpr = c_re(jp[22 * m]); + tmpi = c_im(jp[22 * m]); + r5_22 = ((wr * tmpr) - (wi * tmpi)); + i5_22 = ((wi * tmpr) + (wr * tmpi)); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r5_14 = ((wr * tmpr) - (wi * tmpi)); + i5_14 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[30 * l1]); + wi = c_im(W[30 * l1]); + tmpr = c_re(jp[30 * m]); + tmpi = c_im(jp[30 * m]); + r5_30 = ((wr * tmpr) - (wi * tmpi)); + i5_30 = ((wi * tmpr) + (wr * tmpi)); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = ((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, 
i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r5_1 = ((wr * tmpr) - (wi * tmpi)); + i5_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[17 * l1]); + wi = c_im(W[17 * l1]); + tmpr = c_re(jp[17 * m]); + tmpi = c_im(jp[17 * m]); + r5_17 = ((wr * tmpr) - (wi * tmpi)); + i5_17 = ((wi * tmpr) + (wr * tmpi)); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r5_9 = ((wr * tmpr) - (wi * tmpi)); + i5_9 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[25 * l1]); + wi = c_im(W[25 * l1]); + tmpr = c_re(jp[25 * m]); + tmpi = c_im(jp[25 * m]); + r5_25 = ((wr * tmpr) - (wi * tmpi)); + i5_25 = ((wi * tmpr) + (wr * tmpi)); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r5_5 = ((wr * tmpr) - (wi * tmpi)); + i5_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[21 * l1]); + wi = c_im(W[21 * l1]); + tmpr = c_re(jp[21 * m]); + tmpi = c_im(jp[21 * m]); + r5_21 = ((wr * tmpr) - (wi * tmpi)); + i5_21 = ((wi * tmpr) + (wr * tmpi)); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r5_13 = ((wr * tmpr) - (wi * tmpi)); + i5_13 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[29 * l1]); + wi = c_im(W[29 * l1]); + tmpr = c_re(jp[29 * m]); + tmpi = c_im(jp[29 * m]); + r5_29 = ((wr * tmpr) - (wi * tmpi)); + i5_29 = ((wi * tmpr) + (wr * tmpi)); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL 
r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r5_3 = ((wr * tmpr) - (wi * tmpi)); + i5_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[19 * l1]); + wi = c_im(W[19 * l1]); + tmpr = c_re(jp[19 * m]); + tmpi = c_im(jp[19 * m]); + r5_19 = ((wr * tmpr) - (wi * tmpi)); + i5_19 = ((wi * tmpr) + (wr * tmpi)); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r5_11 = ((wr * tmpr) - (wi * tmpi)); + i5_11 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[27 * l1]); + wi = c_im(W[27 * l1]); + tmpr = c_re(jp[27 * m]); + tmpi = c_im(jp[27 * m]); + r5_27 = ((wr * tmpr) - (wi * tmpi)); + i5_27 = ((wi * tmpr) + (wr * tmpi)); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r5_7 = ((wr * tmpr) - (wi * tmpi)); + i5_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[23 * l1]); + wi = c_im(W[23 * l1]); + tmpr = c_re(jp[23 * m]); + tmpi = c_im(jp[23 * m]); + r5_23 = ((wr * tmpr) - (wi * tmpi)); + i5_23 = ((wi * tmpr) + (wr * tmpi)); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r5_15 = ((wr * tmpr) - (wi * tmpi)); + i5_15 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[31 * l1]); + wi = c_im(W[31 * l1]); + tmpr = c_re(jp[31 * m]); + tmpi = c_im(jp[31 * m]); + r5_31 = ((wr * tmpr) - (wi * tmpi)); + i5_31 = ((wi * tmpr) + (wr * tmpi)); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); 
+ r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[16 * m]) = (r1_0 - r1_1); + c_im(kp[16 * m]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[17 * m]) = (r1_2 - tmpr); + c_im(kp[17 * m]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[18 * m]) = (r1_4 - tmpr); + c_im(kp[18 * m]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[19 * m]) = (r1_6 - tmpr); + c_im(kp[19 * m]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - r1_9)); + c_re(kp[4 * m]) = (r1_8 + tmpr); + c_im(kp[4 * m]) = (i1_8 + tmpi); + c_re(kp[20 * m]) = (r1_8 - tmpr); + c_im(kp[20 * m]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 + tmpi); + c_re(kp[21 * m]) = (r1_10 - tmpr); + c_im(kp[21 * m]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 + tmpi); + c_re(kp[22 * m]) = (r1_12 - tmpr); + c_im(kp[22 * m]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 + tmpi); + c_re(kp[23 * m]) = (r1_14 - tmpr); + c_im(kp[23 * m]) = (i1_14 - tmpi); + c_re(kp[8 * m]) = (r1_16 + i1_17); + c_im(kp[8 * m]) = (i1_16 - r1_17); + c_re(kp[24 * m]) = (r1_16 - i1_17); + c_im(kp[24 * m]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + 
c_re(kp[9 * m]) = (r1_18 + tmpr); + c_im(kp[9 * m]) = (i1_18 - tmpi); + c_re(kp[25 * m]) = (r1_18 - tmpr); + c_im(kp[25 * m]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(kp[10 * m]) = (r1_20 + tmpr); + c_im(kp[10 * m]) = (i1_20 - tmpi); + c_re(kp[26 * m]) = (r1_20 - tmpr); + c_im(kp[26 * m]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(kp[11 * m]) = (r1_22 + tmpr); + c_im(kp[11 * m]) = (i1_22 - tmpi); + c_re(kp[27 * m]) = (r1_22 - tmpr); + c_im(kp[27 * m]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(kp[12 * m]) = (r1_24 + tmpr); + c_im(kp[12 * m]) = (i1_24 - tmpi); + c_re(kp[28 * m]) = (r1_24 - tmpr); + c_im(kp[28 * m]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(kp[13 * m]) = (r1_26 + tmpr); + c_im(kp[13 * m]) = (i1_26 - tmpi); + c_re(kp[29 * m]) = (r1_26 - tmpr); + c_im(kp[29 * m]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(kp[14 * m]) = (r1_28 + tmpr); + c_im(kp[14 * m]) = (i1_28 - tmpi); + c_re(kp[30 * m]) = (r1_28 - tmpr); + c_im(kp[30 * m]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(kp[15 * m]) = (r1_30 + tmpr); + c_im(kp[15 * m]) = (i1_30 - tmpi); + c_re(kp[31 * m]) = (r1_30 - tmpr); + c_im(kp[31 * m]) = (i1_30 + tmpi); + } + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_twiddle_32(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task + fft_twiddle_32(ab, b, in, out, W, nW, nWdn, m); +#else + #pragma omp task untied + fft_twiddle_32(a, ab, in, out, W, nW, nWdn, m); + #pragma omp task untied + fft_twiddle_32(ab, b, in, out, W, nW, nWdn, m); +#endif + #pragma omp taskwait + } +} +void fft_twiddle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m) +{ + int l1, i; + COMPLEX *jp, *kp; + REAL tmpr, tmpi, wr, wi; + if ((b - a) < 128) { + for (i = a, l1 = nWdn * i, kp = out + i; i < b; + i++, l1 += nWdn, kp++) { + jp = in + i; + { + REAL r1_0, i1_0; + REAL r1_1, i1_1; + REAL r1_2, i1_2; + REAL r1_3, i1_3; + REAL r1_4, i1_4; + REAL r1_5, i1_5; + REAL r1_6, i1_6; + REAL r1_7, i1_7; + REAL r1_8, i1_8; + REAL r1_9, i1_9; + REAL r1_10, i1_10; + REAL r1_11, i1_11; + REAL r1_12, i1_12; + REAL r1_13, i1_13; + REAL r1_14, i1_14; + REAL r1_15, i1_15; + REAL r1_16, i1_16; + REAL r1_17, i1_17; + REAL r1_18, i1_18; + REAL r1_19, i1_19; + REAL r1_20, i1_20; + REAL r1_21, i1_21; + REAL r1_22, i1_22; + REAL r1_23, i1_23; + REAL r1_24, i1_24; + REAL r1_25, i1_25; + REAL r1_26, i1_26; + REAL r1_27, i1_27; + REAL r1_28, i1_28; + REAL r1_29, i1_29; + REAL r1_30, i1_30; + REAL r1_31, i1_31; + { + REAL r2_0, i2_0; + REAL r2_2, i2_2; + REAL r2_4, i2_4; + REAL r2_6, i2_6; + REAL r2_8, i2_8; + REAL r2_10, i2_10; + REAL r2_12, i2_12; + REAL r2_14, i2_14; + REAL r2_16, i2_16; + REAL r2_18, i2_18; + REAL r2_20, i2_20; + REAL r2_22, i2_22; + REAL r2_24, i2_24; + REAL r2_26, i2_26; + REAL r2_28, i2_28; + REAL r2_30, i2_30; + { + REAL r3_0, i3_0; + REAL r3_4, i3_4; + REAL r3_8, i3_8; + REAL r3_12, i3_12; + REAL r3_16, i3_16; + REAL r3_20, 
i3_20; + REAL r3_24, i3_24; + REAL r3_28, i3_28; + { + REAL r4_0, i4_0; + REAL r4_8, i4_8; + REAL r4_16, i4_16; + REAL r4_24, i4_24; + { + REAL r5_0, i5_0; + REAL r5_16, i5_16; + r5_0 = c_re(jp[0 * m]); + i5_0 = c_im(jp[0 * m]); + wr = c_re(W[16 * l1]); + wi = c_im(W[16 * l1]); + tmpr = c_re(jp[16 * m]); + tmpi = c_im(jp[16 * m]); + r5_16 = ((wr * tmpr) - (wi * tmpi)); + i5_16 = ((wi * tmpr) + (wr * tmpi)); + r4_0 = (r5_0 + r5_16); + i4_0 = (i5_0 + i5_16); + r4_16 = (r5_0 - r5_16); + i4_16 = (i5_0 - i5_16); + } + { + REAL r5_8, i5_8; + REAL r5_24, i5_24; + wr = c_re(W[8 * l1]); + wi = c_im(W[8 * l1]); + tmpr = c_re(jp[8 * m]); + tmpi = c_im(jp[8 * m]); + r5_8 = ((wr * tmpr) - (wi * tmpi)); + i5_8 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[24 * l1]); + wi = c_im(W[24 * l1]); + tmpr = c_re(jp[24 * m]); + tmpi = c_im(jp[24 * m]); + r5_24 = ((wr * tmpr) - (wi * tmpi)); + i5_24 = ((wi * tmpr) + (wr * tmpi)); + r4_8 = (r5_8 + r5_24); + i4_8 = (i5_8 + i5_24); + r4_24 = (r5_8 - r5_24); + i4_24 = (i5_8 - i5_24); + } + r3_0 = (r4_0 + r4_8); + i3_0 = (i4_0 + i4_8); + r3_16 = (r4_0 - r4_8); + i3_16 = (i4_0 - i4_8); + r3_8 = (r4_16 + i4_24); + i3_8 = (i4_16 - r4_24); + r3_24 = (r4_16 - i4_24); + i3_24 = (i4_16 + r4_24); + } + { + REAL r4_4, i4_4; + REAL r4_12, i4_12; + REAL r4_20, i4_20; + REAL r4_28, i4_28; + { + REAL r5_4, i5_4; + REAL r5_20, i5_20; + wr = c_re(W[4 * l1]); + wi = c_im(W[4 * l1]); + tmpr = c_re(jp[4 * m]); + tmpi = c_im(jp[4 * m]); + r5_4 = ((wr * tmpr) - (wi * tmpi)); + i5_4 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[20 * l1]); + wi = c_im(W[20 * l1]); + tmpr = c_re(jp[20 * m]); + tmpi = c_im(jp[20 * m]); + r5_20 = ((wr * tmpr) - (wi * tmpi)); + i5_20 = ((wi * tmpr) + (wr * tmpi)); + r4_4 = (r5_4 + r5_20); + i4_4 = (i5_4 + i5_20); + r4_20 = (r5_4 - r5_20); + i4_20 = (i5_4 - i5_20); + } + { + REAL r5_12, i5_12; + REAL r5_28, i5_28; + wr = c_re(W[12 * l1]); + wi = c_im(W[12 * l1]); + tmpr = c_re(jp[12 * m]); + tmpi = c_im(jp[12 * m]); + r5_12 = ((wr * tmpr) - (wi * tmpi)); + i5_12 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[28 * l1]); + wi = c_im(W[28 * l1]); + tmpr = c_re(jp[28 * m]); + tmpi = c_im(jp[28 * m]); + r5_28 = ((wr * tmpr) - (wi * tmpi)); + i5_28 = ((wi * tmpr) + (wr * tmpi)); + r4_12 = (r5_12 + r5_28); + i4_12 = (i5_12 + i5_28); + r4_28 = (r5_12 - r5_28); + i4_28 = (i5_12 - i5_28); + } + r3_4 = (r4_4 + r4_12); + i3_4 = (i4_4 + i4_12); + r3_20 = (r4_4 - r4_12); + i3_20 = (i4_4 - i4_12); + r3_12 = (r4_20 + i4_28); + i3_12 = (i4_20 - r4_28); + r3_28 = (r4_20 - i4_28); + i3_28 = (i4_20 + r4_28); + } + r2_0 = (r3_0 + r3_4); + i2_0 = (i3_0 + i3_4); + r2_16 = (r3_0 - r3_4); + i2_16 = (i3_0 - i3_4); + tmpr = (0.707106781187 * (r3_12 + i3_12)); + tmpi = (0.707106781187 * (i3_12 - r3_12)); + r2_4 = (r3_8 + tmpr); + i2_4 = (i3_8 + tmpi); + r2_20 = (r3_8 - tmpr); + i2_20 = (i3_8 - tmpi); + r2_8 = (r3_16 + i3_20); + i2_8 = (i3_16 - r3_20); + r2_24 = (r3_16 - i3_20); + i2_24 = (i3_16 + r3_20); + tmpr = (0.707106781187 * (i3_28 - r3_28)); + tmpi = (0.707106781187 * (r3_28 + i3_28)); + r2_12 = (r3_24 + tmpr); + i2_12 = (i3_24 - tmpi); + r2_28 = (r3_24 - tmpr); + i2_28 = (i3_24 + tmpi); + } + { + REAL r3_2, i3_2; + REAL r3_6, i3_6; + REAL r3_10, i3_10; + REAL r3_14, i3_14; + REAL r3_18, i3_18; + REAL r3_22, i3_22; + REAL r3_26, i3_26; + REAL r3_30, i3_30; + { + REAL r4_2, i4_2; + REAL r4_10, i4_10; + REAL r4_18, i4_18; + REAL r4_26, i4_26; + { + REAL r5_2, i5_2; + REAL r5_18, i5_18; + wr = c_re(W[2 * l1]); + wi = c_im(W[2 * l1]); + tmpr = c_re(jp[2 * m]); + tmpi = c_im(jp[2 * m]); + 
r5_2 = ((wr * tmpr) - (wi * tmpi)); + i5_2 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[18 * l1]); + wi = c_im(W[18 * l1]); + tmpr = c_re(jp[18 * m]); + tmpi = c_im(jp[18 * m]); + r5_18 = ((wr * tmpr) - (wi * tmpi)); + i5_18 = ((wi * tmpr) + (wr * tmpi)); + r4_2 = (r5_2 + r5_18); + i4_2 = (i5_2 + i5_18); + r4_18 = (r5_2 - r5_18); + i4_18 = (i5_2 - i5_18); + } + { + REAL r5_10, i5_10; + REAL r5_26, i5_26; + wr = c_re(W[10 * l1]); + wi = c_im(W[10 * l1]); + tmpr = c_re(jp[10 * m]); + tmpi = c_im(jp[10 * m]); + r5_10 = ((wr * tmpr) - (wi * tmpi)); + i5_10 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[26 * l1]); + wi = c_im(W[26 * l1]); + tmpr = c_re(jp[26 * m]); + tmpi = c_im(jp[26 * m]); + r5_26 = ((wr * tmpr) - (wi * tmpi)); + i5_26 = ((wi * tmpr) + (wr * tmpi)); + r4_10 = (r5_10 + r5_26); + i4_10 = (i5_10 + i5_26); + r4_26 = (r5_10 - r5_26); + i4_26 = (i5_10 - i5_26); + } + r3_2 = (r4_2 + r4_10); + i3_2 = (i4_2 + i4_10); + r3_18 = (r4_2 - r4_10); + i3_18 = (i4_2 - i4_10); + r3_10 = (r4_18 + i4_26); + i3_10 = (i4_18 - r4_26); + r3_26 = (r4_18 - i4_26); + i3_26 = (i4_18 + r4_26); + } + { + REAL r4_6, i4_6; + REAL r4_14, i4_14; + REAL r4_22, i4_22; + REAL r4_30, i4_30; + { + REAL r5_6, i5_6; + REAL r5_22, i5_22; + wr = c_re(W[6 * l1]); + wi = c_im(W[6 * l1]); + tmpr = c_re(jp[6 * m]); + tmpi = c_im(jp[6 * m]); + r5_6 = ((wr * tmpr) - (wi * tmpi)); + i5_6 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[22 * l1]); + wi = c_im(W[22 * l1]); + tmpr = c_re(jp[22 * m]); + tmpi = c_im(jp[22 * m]); + r5_22 = ((wr * tmpr) - (wi * tmpi)); + i5_22 = ((wi * tmpr) + (wr * tmpi)); + r4_6 = (r5_6 + r5_22); + i4_6 = (i5_6 + i5_22); + r4_22 = (r5_6 - r5_22); + i4_22 = (i5_6 - i5_22); + } + { + REAL r5_14, i5_14; + REAL r5_30, i5_30; + wr = c_re(W[14 * l1]); + wi = c_im(W[14 * l1]); + tmpr = c_re(jp[14 * m]); + tmpi = c_im(jp[14 * m]); + r5_14 = ((wr * tmpr) - (wi * tmpi)); + i5_14 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[30 * l1]); + wi = c_im(W[30 * l1]); + tmpr = c_re(jp[30 * m]); + tmpi = c_im(jp[30 * m]); + r5_30 = ((wr * tmpr) - (wi * tmpi)); + i5_30 = ((wi * tmpr) + (wr * tmpi)); + r4_14 = (r5_14 + r5_30); + i4_14 = (i5_14 + i5_30); + r4_30 = (r5_14 - r5_30); + i4_30 = (i5_14 - i5_30); + } + r3_6 = (r4_6 + r4_14); + i3_6 = (i4_6 + i4_14); + r3_22 = (r4_6 - r4_14); + i3_22 = (i4_6 - i4_14); + r3_14 = (r4_22 + i4_30); + i3_14 = (i4_22 - r4_30); + r3_30 = (r4_22 - i4_30); + i3_30 = (i4_22 + r4_30); + } + r2_2 = (r3_2 + r3_6); + i2_2 = (i3_2 + i3_6); + r2_18 = (r3_2 - r3_6); + i2_18 = (i3_2 - i3_6); + tmpr = (0.707106781187 * (r3_14 + i3_14)); + tmpi = (0.707106781187 * (i3_14 - r3_14)); + r2_6 = (r3_10 + tmpr); + i2_6 = (i3_10 + tmpi); + r2_22 = (r3_10 - tmpr); + i2_22 = (i3_10 - tmpi); + r2_10 = (r3_18 + i3_22); + i2_10 = (i3_18 - r3_22); + r2_26 = (r3_18 - i3_22); + i2_26 = (i3_18 + r3_22); + tmpr = (0.707106781187 * (i3_30 - r3_30)); + tmpi = (0.707106781187 * (r3_30 + i3_30)); + r2_14 = (r3_26 + tmpr); + i2_14 = (i3_26 - tmpi); + r2_30 = (r3_26 - tmpr); + i2_30 = (i3_26 + tmpi); + } + r1_0 = (r2_0 + r2_2); + i1_0 = (i2_0 + i2_2); + r1_16 = (r2_0 - r2_2); + i1_16 = (i2_0 - i2_2); + tmpr = ((0.923879532511 * r2_6) + (0.382683432365 * i2_6)); + tmpi = ((0.923879532511 * i2_6) - (0.382683432365 * r2_6)); + r1_2 = (r2_4 + tmpr); + i1_2 = (i2_4 + tmpi); + r1_18 = (r2_4 - tmpr); + i1_18 = (i2_4 - tmpi); + tmpr = (0.707106781187 * (r2_10 + i2_10)); + tmpi = (0.707106781187 * (i2_10 - r2_10)); + r1_4 = (r2_8 + tmpr); + i1_4 = (i2_8 + tmpi); + r1_20 = (r2_8 - tmpr); + i1_20 = (i2_8 - tmpi); + tmpr = 
((0.382683432365 * r2_14) + (0.923879532511 * i2_14)); + tmpi = ((0.382683432365 * i2_14) - (0.923879532511 * r2_14)); + r1_6 = (r2_12 + tmpr); + i1_6 = (i2_12 + tmpi); + r1_22 = (r2_12 - tmpr); + i1_22 = (i2_12 - tmpi); + r1_8 = (r2_16 + i2_18); + i1_8 = (i2_16 - r2_18); + r1_24 = (r2_16 - i2_18); + i1_24 = (i2_16 + r2_18); + tmpr = ((0.923879532511 * i2_22) - (0.382683432365 * r2_22)); + tmpi = ((0.923879532511 * r2_22) + (0.382683432365 * i2_22)); + r1_10 = (r2_20 + tmpr); + i1_10 = (i2_20 - tmpi); + r1_26 = (r2_20 - tmpr); + i1_26 = (i2_20 + tmpi); + tmpr = (0.707106781187 * (i2_26 - r2_26)); + tmpi = (0.707106781187 * (r2_26 + i2_26)); + r1_12 = (r2_24 + tmpr); + i1_12 = (i2_24 - tmpi); + r1_28 = (r2_24 - tmpr); + i1_28 = (i2_24 + tmpi); + tmpr = ((0.382683432365 * i2_30) - (0.923879532511 * r2_30)); + tmpi = ((0.382683432365 * r2_30) + (0.923879532511 * i2_30)); + r1_14 = (r2_28 + tmpr); + i1_14 = (i2_28 - tmpi); + r1_30 = (r2_28 - tmpr); + i1_30 = (i2_28 + tmpi); + } + { + REAL r2_1, i2_1; + REAL r2_3, i2_3; + REAL r2_5, i2_5; + REAL r2_7, i2_7; + REAL r2_9, i2_9; + REAL r2_11, i2_11; + REAL r2_13, i2_13; + REAL r2_15, i2_15; + REAL r2_17, i2_17; + REAL r2_19, i2_19; + REAL r2_21, i2_21; + REAL r2_23, i2_23; + REAL r2_25, i2_25; + REAL r2_27, i2_27; + REAL r2_29, i2_29; + REAL r2_31, i2_31; + { + REAL r3_1, i3_1; + REAL r3_5, i3_5; + REAL r3_9, i3_9; + REAL r3_13, i3_13; + REAL r3_17, i3_17; + REAL r3_21, i3_21; + REAL r3_25, i3_25; + REAL r3_29, i3_29; + { + REAL r4_1, i4_1; + REAL r4_9, i4_9; + REAL r4_17, i4_17; + REAL r4_25, i4_25; + { + REAL r5_1, i5_1; + REAL r5_17, i5_17; + wr = c_re(W[1 * l1]); + wi = c_im(W[1 * l1]); + tmpr = c_re(jp[1 * m]); + tmpi = c_im(jp[1 * m]); + r5_1 = ((wr * tmpr) - (wi * tmpi)); + i5_1 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[17 * l1]); + wi = c_im(W[17 * l1]); + tmpr = c_re(jp[17 * m]); + tmpi = c_im(jp[17 * m]); + r5_17 = ((wr * tmpr) - (wi * tmpi)); + i5_17 = ((wi * tmpr) + (wr * tmpi)); + r4_1 = (r5_1 + r5_17); + i4_1 = (i5_1 + i5_17); + r4_17 = (r5_1 - r5_17); + i4_17 = (i5_1 - i5_17); + } + { + REAL r5_9, i5_9; + REAL r5_25, i5_25; + wr = c_re(W[9 * l1]); + wi = c_im(W[9 * l1]); + tmpr = c_re(jp[9 * m]); + tmpi = c_im(jp[9 * m]); + r5_9 = ((wr * tmpr) - (wi * tmpi)); + i5_9 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[25 * l1]); + wi = c_im(W[25 * l1]); + tmpr = c_re(jp[25 * m]); + tmpi = c_im(jp[25 * m]); + r5_25 = ((wr * tmpr) - (wi * tmpi)); + i5_25 = ((wi * tmpr) + (wr * tmpi)); + r4_9 = (r5_9 + r5_25); + i4_9 = (i5_9 + i5_25); + r4_25 = (r5_9 - r5_25); + i4_25 = (i5_9 - i5_25); + } + r3_1 = (r4_1 + r4_9); + i3_1 = (i4_1 + i4_9); + r3_17 = (r4_1 - r4_9); + i3_17 = (i4_1 - i4_9); + r3_9 = (r4_17 + i4_25); + i3_9 = (i4_17 - r4_25); + r3_25 = (r4_17 - i4_25); + i3_25 = (i4_17 + r4_25); + } + { + REAL r4_5, i4_5; + REAL r4_13, i4_13; + REAL r4_21, i4_21; + REAL r4_29, i4_29; + { + REAL r5_5, i5_5; + REAL r5_21, i5_21; + wr = c_re(W[5 * l1]); + wi = c_im(W[5 * l1]); + tmpr = c_re(jp[5 * m]); + tmpi = c_im(jp[5 * m]); + r5_5 = ((wr * tmpr) - (wi * tmpi)); + i5_5 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[21 * l1]); + wi = c_im(W[21 * l1]); + tmpr = c_re(jp[21 * m]); + tmpi = c_im(jp[21 * m]); + r5_21 = ((wr * tmpr) - (wi * tmpi)); + i5_21 = ((wi * tmpr) + (wr * tmpi)); + r4_5 = (r5_5 + r5_21); + i4_5 = (i5_5 + i5_21); + r4_21 = (r5_5 - r5_21); + i4_21 = (i5_5 - i5_21); + } + { + REAL r5_13, i5_13; + REAL r5_29, i5_29; + wr = c_re(W[13 * l1]); + wi = c_im(W[13 * l1]); + tmpr = c_re(jp[13 * m]); + tmpi = c_im(jp[13 * m]); + r5_13 = ((wr * 
tmpr) - (wi * tmpi)); + i5_13 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[29 * l1]); + wi = c_im(W[29 * l1]); + tmpr = c_re(jp[29 * m]); + tmpi = c_im(jp[29 * m]); + r5_29 = ((wr * tmpr) - (wi * tmpi)); + i5_29 = ((wi * tmpr) + (wr * tmpi)); + r4_13 = (r5_13 + r5_29); + i4_13 = (i5_13 + i5_29); + r4_29 = (r5_13 - r5_29); + i4_29 = (i5_13 - i5_29); + } + r3_5 = (r4_5 + r4_13); + i3_5 = (i4_5 + i4_13); + r3_21 = (r4_5 - r4_13); + i3_21 = (i4_5 - i4_13); + r3_13 = (r4_21 + i4_29); + i3_13 = (i4_21 - r4_29); + r3_29 = (r4_21 - i4_29); + i3_29 = (i4_21 + r4_29); + } + r2_1 = (r3_1 + r3_5); + i2_1 = (i3_1 + i3_5); + r2_17 = (r3_1 - r3_5); + i2_17 = (i3_1 - i3_5); + tmpr = (0.707106781187 * (r3_13 + i3_13)); + tmpi = (0.707106781187 * (i3_13 - r3_13)); + r2_5 = (r3_9 + tmpr); + i2_5 = (i3_9 + tmpi); + r2_21 = (r3_9 - tmpr); + i2_21 = (i3_9 - tmpi); + r2_9 = (r3_17 + i3_21); + i2_9 = (i3_17 - r3_21); + r2_25 = (r3_17 - i3_21); + i2_25 = (i3_17 + r3_21); + tmpr = (0.707106781187 * (i3_29 - r3_29)); + tmpi = (0.707106781187 * (r3_29 + i3_29)); + r2_13 = (r3_25 + tmpr); + i2_13 = (i3_25 - tmpi); + r2_29 = (r3_25 - tmpr); + i2_29 = (i3_25 + tmpi); + } + { + REAL r3_3, i3_3; + REAL r3_7, i3_7; + REAL r3_11, i3_11; + REAL r3_15, i3_15; + REAL r3_19, i3_19; + REAL r3_23, i3_23; + REAL r3_27, i3_27; + REAL r3_31, i3_31; + { + REAL r4_3, i4_3; + REAL r4_11, i4_11; + REAL r4_19, i4_19; + REAL r4_27, i4_27; + { + REAL r5_3, i5_3; + REAL r5_19, i5_19; + wr = c_re(W[3 * l1]); + wi = c_im(W[3 * l1]); + tmpr = c_re(jp[3 * m]); + tmpi = c_im(jp[3 * m]); + r5_3 = ((wr * tmpr) - (wi * tmpi)); + i5_3 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[19 * l1]); + wi = c_im(W[19 * l1]); + tmpr = c_re(jp[19 * m]); + tmpi = c_im(jp[19 * m]); + r5_19 = ((wr * tmpr) - (wi * tmpi)); + i5_19 = ((wi * tmpr) + (wr * tmpi)); + r4_3 = (r5_3 + r5_19); + i4_3 = (i5_3 + i5_19); + r4_19 = (r5_3 - r5_19); + i4_19 = (i5_3 - i5_19); + } + { + REAL r5_11, i5_11; + REAL r5_27, i5_27; + wr = c_re(W[11 * l1]); + wi = c_im(W[11 * l1]); + tmpr = c_re(jp[11 * m]); + tmpi = c_im(jp[11 * m]); + r5_11 = ((wr * tmpr) - (wi * tmpi)); + i5_11 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[27 * l1]); + wi = c_im(W[27 * l1]); + tmpr = c_re(jp[27 * m]); + tmpi = c_im(jp[27 * m]); + r5_27 = ((wr * tmpr) - (wi * tmpi)); + i5_27 = ((wi * tmpr) + (wr * tmpi)); + r4_11 = (r5_11 + r5_27); + i4_11 = (i5_11 + i5_27); + r4_27 = (r5_11 - r5_27); + i4_27 = (i5_11 - i5_27); + } + r3_3 = (r4_3 + r4_11); + i3_3 = (i4_3 + i4_11); + r3_19 = (r4_3 - r4_11); + i3_19 = (i4_3 - i4_11); + r3_11 = (r4_19 + i4_27); + i3_11 = (i4_19 - r4_27); + r3_27 = (r4_19 - i4_27); + i3_27 = (i4_19 + r4_27); + } + { + REAL r4_7, i4_7; + REAL r4_15, i4_15; + REAL r4_23, i4_23; + REAL r4_31, i4_31; + { + REAL r5_7, i5_7; + REAL r5_23, i5_23; + wr = c_re(W[7 * l1]); + wi = c_im(W[7 * l1]); + tmpr = c_re(jp[7 * m]); + tmpi = c_im(jp[7 * m]); + r5_7 = ((wr * tmpr) - (wi * tmpi)); + i5_7 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[23 * l1]); + wi = c_im(W[23 * l1]); + tmpr = c_re(jp[23 * m]); + tmpi = c_im(jp[23 * m]); + r5_23 = ((wr * tmpr) - (wi * tmpi)); + i5_23 = ((wi * tmpr) + (wr * tmpi)); + r4_7 = (r5_7 + r5_23); + i4_7 = (i5_7 + i5_23); + r4_23 = (r5_7 - r5_23); + i4_23 = (i5_7 - i5_23); + } + { + REAL r5_15, i5_15; + REAL r5_31, i5_31; + wr = c_re(W[15 * l1]); + wi = c_im(W[15 * l1]); + tmpr = c_re(jp[15 * m]); + tmpi = c_im(jp[15 * m]); + r5_15 = ((wr * tmpr) - (wi * tmpi)); + i5_15 = ((wi * tmpr) + (wr * tmpi)); + wr = c_re(W[31 * l1]); + wi = c_im(W[31 * l1]); + tmpr = c_re(jp[31 * 
m]); + tmpi = c_im(jp[31 * m]); + r5_31 = ((wr * tmpr) - (wi * tmpi)); + i5_31 = ((wi * tmpr) + (wr * tmpi)); + r4_15 = (r5_15 + r5_31); + i4_15 = (i5_15 + i5_31); + r4_31 = (r5_15 - r5_31); + i4_31 = (i5_15 - i5_31); + } + r3_7 = (r4_7 + r4_15); + i3_7 = (i4_7 + i4_15); + r3_23 = (r4_7 - r4_15); + i3_23 = (i4_7 - i4_15); + r3_15 = (r4_23 + i4_31); + i3_15 = (i4_23 - r4_31); + r3_31 = (r4_23 - i4_31); + i3_31 = (i4_23 + r4_31); + } + r2_3 = (r3_3 + r3_7); + i2_3 = (i3_3 + i3_7); + r2_19 = (r3_3 - r3_7); + i2_19 = (i3_3 - i3_7); + tmpr = (0.707106781187 * (r3_15 + i3_15)); + tmpi = (0.707106781187 * (i3_15 - r3_15)); + r2_7 = (r3_11 + tmpr); + i2_7 = (i3_11 + tmpi); + r2_23 = (r3_11 - tmpr); + i2_23 = (i3_11 - tmpi); + r2_11 = (r3_19 + i3_23); + i2_11 = (i3_19 - r3_23); + r2_27 = (r3_19 - i3_23); + i2_27 = (i3_19 + r3_23); + tmpr = (0.707106781187 * (i3_31 - r3_31)); + tmpi = (0.707106781187 * (r3_31 + i3_31)); + r2_15 = (r3_27 + tmpr); + i2_15 = (i3_27 - tmpi); + r2_31 = (r3_27 - tmpr); + i2_31 = (i3_27 + tmpi); + } + r1_1 = (r2_1 + r2_3); + i1_1 = (i2_1 + i2_3); + r1_17 = (r2_1 - r2_3); + i1_17 = (i2_1 - i2_3); + tmpr = ((0.923879532511 * r2_7) + (0.382683432365 * i2_7)); + tmpi = ((0.923879532511 * i2_7) - (0.382683432365 * r2_7)); + r1_3 = (r2_5 + tmpr); + i1_3 = (i2_5 + tmpi); + r1_19 = (r2_5 - tmpr); + i1_19 = (i2_5 - tmpi); + tmpr = (0.707106781187 * (r2_11 + i2_11)); + tmpi = (0.707106781187 * (i2_11 - r2_11)); + r1_5 = (r2_9 + tmpr); + i1_5 = (i2_9 + tmpi); + r1_21 = (r2_9 - tmpr); + i1_21 = (i2_9 - tmpi); + tmpr = ((0.382683432365 * r2_15) + (0.923879532511 * i2_15)); + tmpi = ((0.382683432365 * i2_15) - (0.923879532511 * r2_15)); + r1_7 = (r2_13 + tmpr); + i1_7 = (i2_13 + tmpi); + r1_23 = (r2_13 - tmpr); + i1_23 = (i2_13 - tmpi); + r1_9 = (r2_17 + i2_19); + i1_9 = (i2_17 - r2_19); + r1_25 = (r2_17 - i2_19); + i1_25 = (i2_17 + r2_19); + tmpr = ((0.923879532511 * i2_23) - (0.382683432365 * r2_23)); + tmpi = ((0.923879532511 * r2_23) + (0.382683432365 * i2_23)); + r1_11 = (r2_21 + tmpr); + i1_11 = (i2_21 - tmpi); + r1_27 = (r2_21 - tmpr); + i1_27 = (i2_21 + tmpi); + tmpr = (0.707106781187 * (i2_27 - r2_27)); + tmpi = (0.707106781187 * (r2_27 + i2_27)); + r1_13 = (r2_25 + tmpr); + i1_13 = (i2_25 - tmpi); + r1_29 = (r2_25 - tmpr); + i1_29 = (i2_25 + tmpi); + tmpr = ((0.382683432365 * i2_31) - (0.923879532511 * r2_31)); + tmpi = ((0.382683432365 * r2_31) + (0.923879532511 * i2_31)); + r1_15 = (r2_29 + tmpr); + i1_15 = (i2_29 - tmpi); + r1_31 = (r2_29 - tmpr); + i1_31 = (i2_29 + tmpi); + } + c_re(kp[0 * m]) = (r1_0 + r1_1); + c_im(kp[0 * m]) = (i1_0 + i1_1); + c_re(kp[16 * m]) = (r1_0 - r1_1); + c_im(kp[16 * m]) = (i1_0 - i1_1); + tmpr = ((0.980785280403 * r1_3) + (0.195090322016 * i1_3)); + tmpi = ((0.980785280403 * i1_3) - (0.195090322016 * r1_3)); + c_re(kp[1 * m]) = (r1_2 + tmpr); + c_im(kp[1 * m]) = (i1_2 + tmpi); + c_re(kp[17 * m]) = (r1_2 - tmpr); + c_im(kp[17 * m]) = (i1_2 - tmpi); + tmpr = ((0.923879532511 * r1_5) + (0.382683432365 * i1_5)); + tmpi = ((0.923879532511 * i1_5) - (0.382683432365 * r1_5)); + c_re(kp[2 * m]) = (r1_4 + tmpr); + c_im(kp[2 * m]) = (i1_4 + tmpi); + c_re(kp[18 * m]) = (r1_4 - tmpr); + c_im(kp[18 * m]) = (i1_4 - tmpi); + tmpr = ((0.831469612303 * r1_7) + (0.55557023302 * i1_7)); + tmpi = ((0.831469612303 * i1_7) - (0.55557023302 * r1_7)); + c_re(kp[3 * m]) = (r1_6 + tmpr); + c_im(kp[3 * m]) = (i1_6 + tmpi); + c_re(kp[19 * m]) = (r1_6 - tmpr); + c_im(kp[19 * m]) = (i1_6 - tmpi); + tmpr = (0.707106781187 * (r1_9 + i1_9)); + tmpi = (0.707106781187 * (i1_9 - 
r1_9)); + c_re(kp[4 * m]) = (r1_8 + tmpr); + c_im(kp[4 * m]) = (i1_8 + tmpi); + c_re(kp[20 * m]) = (r1_8 - tmpr); + c_im(kp[20 * m]) = (i1_8 - tmpi); + tmpr = ((0.55557023302 * r1_11) + (0.831469612303 * i1_11)); + tmpi = ((0.55557023302 * i1_11) - (0.831469612303 * r1_11)); + c_re(kp[5 * m]) = (r1_10 + tmpr); + c_im(kp[5 * m]) = (i1_10 + tmpi); + c_re(kp[21 * m]) = (r1_10 - tmpr); + c_im(kp[21 * m]) = (i1_10 - tmpi); + tmpr = ((0.382683432365 * r1_13) + (0.923879532511 * i1_13)); + tmpi = ((0.382683432365 * i1_13) - (0.923879532511 * r1_13)); + c_re(kp[6 * m]) = (r1_12 + tmpr); + c_im(kp[6 * m]) = (i1_12 + tmpi); + c_re(kp[22 * m]) = (r1_12 - tmpr); + c_im(kp[22 * m]) = (i1_12 - tmpi); + tmpr = ((0.195090322016 * r1_15) + (0.980785280403 * i1_15)); + tmpi = ((0.195090322016 * i1_15) - (0.980785280403 * r1_15)); + c_re(kp[7 * m]) = (r1_14 + tmpr); + c_im(kp[7 * m]) = (i1_14 + tmpi); + c_re(kp[23 * m]) = (r1_14 - tmpr); + c_im(kp[23 * m]) = (i1_14 - tmpi); + c_re(kp[8 * m]) = (r1_16 + i1_17); + c_im(kp[8 * m]) = (i1_16 - r1_17); + c_re(kp[24 * m]) = (r1_16 - i1_17); + c_im(kp[24 * m]) = (i1_16 + r1_17); + tmpr = ((0.980785280403 * i1_19) - (0.195090322016 * r1_19)); + tmpi = ((0.980785280403 * r1_19) + (0.195090322016 * i1_19)); + c_re(kp[9 * m]) = (r1_18 + tmpr); + c_im(kp[9 * m]) = (i1_18 - tmpi); + c_re(kp[25 * m]) = (r1_18 - tmpr); + c_im(kp[25 * m]) = (i1_18 + tmpi); + tmpr = ((0.923879532511 * i1_21) - (0.382683432365 * r1_21)); + tmpi = ((0.923879532511 * r1_21) + (0.382683432365 * i1_21)); + c_re(kp[10 * m]) = (r1_20 + tmpr); + c_im(kp[10 * m]) = (i1_20 - tmpi); + c_re(kp[26 * m]) = (r1_20 - tmpr); + c_im(kp[26 * m]) = (i1_20 + tmpi); + tmpr = ((0.831469612303 * i1_23) - (0.55557023302 * r1_23)); + tmpi = ((0.831469612303 * r1_23) + (0.55557023302 * i1_23)); + c_re(kp[11 * m]) = (r1_22 + tmpr); + c_im(kp[11 * m]) = (i1_22 - tmpi); + c_re(kp[27 * m]) = (r1_22 - tmpr); + c_im(kp[27 * m]) = (i1_22 + tmpi); + tmpr = (0.707106781187 * (i1_25 - r1_25)); + tmpi = (0.707106781187 * (r1_25 + i1_25)); + c_re(kp[12 * m]) = (r1_24 + tmpr); + c_im(kp[12 * m]) = (i1_24 - tmpi); + c_re(kp[28 * m]) = (r1_24 - tmpr); + c_im(kp[28 * m]) = (i1_24 + tmpi); + tmpr = ((0.55557023302 * i1_27) - (0.831469612303 * r1_27)); + tmpi = ((0.55557023302 * r1_27) + (0.831469612303 * i1_27)); + c_re(kp[13 * m]) = (r1_26 + tmpr); + c_im(kp[13 * m]) = (i1_26 - tmpi); + c_re(kp[29 * m]) = (r1_26 - tmpr); + c_im(kp[29 * m]) = (i1_26 + tmpi); + tmpr = ((0.382683432365 * i1_29) - (0.923879532511 * r1_29)); + tmpi = ((0.382683432365 * r1_29) + (0.923879532511 * i1_29)); + c_re(kp[14 * m]) = (r1_28 + tmpr); + c_im(kp[14 * m]) = (i1_28 - tmpi); + c_re(kp[30 * m]) = (r1_28 - tmpr); + c_im(kp[30 * m]) = (i1_28 + tmpi); + tmpr = ((0.195090322016 * i1_31) - (0.980785280403 * r1_31)); + tmpi = ((0.195090322016 * r1_31) + (0.980785280403 * i1_31)); + c_re(kp[15 * m]) = (r1_30 + tmpr); + c_im(kp[15 * m]) = (i1_30 - tmpi); + c_re(kp[31 * m]) = (r1_30 - tmpr); + c_im(kp[31 * m]) = (i1_30 + tmpi); + } + } + } else { + int ab = (a + b) / 2; + fft_twiddle_32_seq(a, ab, in, out, W, nW, nWdn, m); + fft_twiddle_32_seq(ab, b, in, out, W, nW, nWdn, m); + } +} +void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 32; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; 
+ jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; +#if defined(FORCE_TIED_TASKS) + #pragma omp task + fft_unshuffle_32(a, ab, in, out, m); + #pragma omp task + fft_unshuffle_32(ab, b, in, out, m); +#else + #pragma omp task untied + fft_unshuffle_32(a, ab, in, out, m); + #pragma omp task untied + fft_unshuffle_32(ab, b, in, out, m); +#endif + #pragma omp taskwait + } +} +void fft_unshuffle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m) +{ + int i; + const COMPLEX *ip; + COMPLEX *jp; + if ((b - a) < 128) { + ip = in + a * 32; + for (i = a; i < b; ++i) { + jp = out + i; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + jp += 2 * m; + jp[0] = ip[0]; + jp[m] = ip[1]; + ip += 2; + } + } else { + int ab = (a + b) / 2; + fft_unshuffle_32_seq(a, ab, in, out, m); + fft_unshuffle_32_seq(ab, b, in, out, m); + } +} +/* end of machine-generated code */ + +/* + * Recursive complex FFT on the n complex components of the array in: + * basic Cooley-Tukey algorithm, with some improvements for + * n power of two. The result is placed in the array out. n is arbitrary. + * The algorithm runs in time O(n*(r1 + ... + rk)) where r1, ..., rk + * are prime numbers, and r1 * r2 * ... * rk = n. 
+ *
+ * n: size of the input
+ * in: pointer to input
+ * out: pointer to output
+ * factors: list of factors of n, precomputed
+ * W: twiddle factors
+ * nW: size of W, that is, size of the original transform
+ *
+ */
+void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW)
+{
+ int r, m;
+ int k;
+
+ /* special cases */
+ if (n == 32) {
+ fft_base_32(in, out);
+ return;
+ }
+ if (n == 16) {
+ fft_base_16(in, out);
+ return;
+ }
+ if (n == 8) {
+ fft_base_8(in, out);
+ return;
+ }
+ if (n == 4) {
+ fft_base_4(in, out);
+ return;
+ }
+ if (n == 2) {
+ fft_base_2(in, out);
+ return;
+ }
+ /*
+ * the cases n == 3, n == 5, and maybe 7 should be implemented as well
+ */
+
+ r = *factors;
+ m = n / r;
+
+ if (r < n) {
+ /*
+ * split the DFT of length n into r DFTs of length n/r, and
+ * recurse
+ */
+#if defined(FORCE_TIED_TASKS)
+ if (r == 32) {
+ #pragma omp task
+ fft_unshuffle_32(0, m, in, out, m);
+ } else if (r == 16) {
+ #pragma omp task
+ fft_unshuffle_16(0, m, in, out, m);
+ } else if (r == 8) {
+ #pragma omp task
+ fft_unshuffle_8(0, m, in, out, m);
+ } else if (r == 4) {
+ #pragma omp task
+ fft_unshuffle_4(0, m, in, out, m);
+ } else if (r == 2) {
+ #pragma omp task
+ fft_unshuffle_2(0, m, in, out, m);
+ } else
+ unshuffle(0, m, in, out, r, m);
+#else
+ if (r == 32) {
+ #pragma omp task untied
+ fft_unshuffle_32(0, m, in, out, m);
+ } else if (r == 16) {
+ #pragma omp task untied
+ fft_unshuffle_16(0, m, in, out, m);
+ } else if (r == 8) {
+ #pragma omp task untied
+ fft_unshuffle_8(0, m, in, out, m);
+ } else if (r == 4) {
+ #pragma omp task untied
+ fft_unshuffle_4(0, m, in, out, m);
+ } else if (r == 2) {
+ #pragma omp task untied
+ fft_unshuffle_2(0, m, in, out, m);
+ } else
+ unshuffle(0, m, in, out, r, m);
+
+#endif
+ #pragma omp taskwait
+
+ for (k = 0; k < n; k += m) {
+#if defined(FORCE_TIED_TASKS)
+ #pragma omp task
+ fft_aux(m, out + k, in + k, factors + 1, W, nW);
+#else
+ #pragma omp task untied
+ fft_aux(m, out + k, in + k, factors + 1, W, nW);
+#endif
+ }
+ #pragma omp taskwait
+ }
+ /*
+ * now multiply by the twiddle factors, and perform m FFTs
+ * of length r
+ */
+#if defined(FORCE_TIED_TASKS)
+ if (r == 2) {
+ #pragma omp task
+ fft_twiddle_2(0, m, in, out, W, nW, nW / n, m);
+ } else if (r == 4) {
+ #pragma omp task
+ fft_twiddle_4(0, m, in, out, W, nW, nW / n, m);
+ } else if (r == 8) {
+ #pragma omp task
+ fft_twiddle_8(0, m, in, out, W, nW, nW / n, m);
+ } else if (r == 16) {
+ #pragma omp task
+ fft_twiddle_16(0, m, in, out, W, nW, nW / n, m);
+ } else if (r == 32) {
+ #pragma omp task
+ fft_twiddle_32(0, m, in, out, W, nW, nW / n, m);
+ } else {
+ #pragma omp task
+ fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m);
+ }
+#else
+ if (r == 2) {
+ #pragma omp task untied
+ fft_twiddle_2(0, m, in, out, W, nW, nW / n, m);
+ } else if (r == 4) {
+ #pragma omp task untied
+ fft_twiddle_4(0, m, in, out, W, nW, nW / n, m);
+ } else if (r == 8) {
+ #pragma omp task untied
+ fft_twiddle_8(0, m, in, out, W, nW, nW / n, m);
+ } else if (r == 16) {
+ #pragma omp task untied
+ fft_twiddle_16(0, m, in, out, W, nW, nW / n, m);
+ } else if (r == 32) {
+ #pragma omp task untied
+ fft_twiddle_32(0, m, in, out, W, nW, nW / n, m);
+ } else {
+ #pragma omp task untied
+ fft_twiddle_gen(0, m, in, out, W, nW, nW / n, r, m);
+ }
+#endif
+
+ #pragma omp taskwait
+
+ return;
+}
+
+void fft_aux_seq(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW)
+{
+ int r, m;
+ int k;
+
+ /* special cases */
+ if (n == 32) {
+ fft_base_32(in, out);
+ return;
+ }
+ if (n == 16) {
+ fft_base_16(in, out);
+ return;
+ }
+ if (n == 8) {
+ fft_base_8(in, out);
+ return;
+ }
+ if (n == 4) {
+ fft_base_4(in, out);
+ return;
+ }
+ if (n == 2) {
+ fft_base_2(in, out);
+ return;
+ }
+ /*
+ * the cases n == 3, n == 5, and maybe 7 should be implemented as well
+ */
+
+ r = *factors;
+ m = n / r;
+
+ if (r < n) {
+ /*
+ * split the DFT of length n into r DFTs of length n/r, and
+ * recurse
+ */
+ if (r == 32) fft_unshuffle_32_seq(0, m, in, out, m);
+ else if (r == 16) fft_unshuffle_16_seq(0, m, in, out, m);
+ else if (r == 8) fft_unshuffle_8_seq(0, m, in, out, m);
+ else if (r == 4) fft_unshuffle_4_seq(0, m, in, out, m);
+ else if (r == 2) fft_unshuffle_2_seq(0, m, in, out, m);
+ else unshuffle_seq(0, m, in, out, r, m);
+
+ for (k = 0; k < n; k += m) {
+ fft_aux_seq(m, out + k, in + k, factors + 1, W, nW);
+ }
+ }
+ /*
+ * now multiply by the twiddle factors, and perform m FFTs
+ * of length r
+ */
+ if (r == 2) fft_twiddle_2_seq(0, m, in, out, W, nW, nW / n, m);
+ else if (r == 4) fft_twiddle_4_seq(0, m, in, out, W, nW, nW / n, m);
+ else if (r == 8) fft_twiddle_8_seq(0, m, in, out, W, nW, nW / n, m);
+ else if (r == 16) fft_twiddle_16_seq(0, m, in, out, W, nW, nW / n, m);
+ else if (r == 32) fft_twiddle_32_seq(0, m, in, out, W, nW, nW / n, m);
+ else fft_twiddle_gen_seq(0, m, in, out, W, nW, nW / n, r, m);
+
+ return;
+}
+/*
+ * user interface for fft_aux
+ */
+void fft(int n, COMPLEX * in, COMPLEX * out)
+{
+ int factors[40]; /* allows FFTs up to at least 3^40 */
+ int *p = factors;
+ int l = n;
+ int r;
+ COMPLEX *W;
+
+ bots_message("Computing coefficients ");
+ W = (COMPLEX *) malloc((n + 1) * sizeof(COMPLEX));
+ #pragma omp parallel
+ #pragma omp single
+#if defined(FORCE_TIED_TASKS)
+ #pragma omp task
+#else
+ #pragma omp task untied
+#endif
+ compute_w_coefficients(n, 0, n / 2, W);
+ bots_message(" completed!\n");
+
+ /*
+ * find factors of n, first 8, then 4 and then primes in ascending
+ * order
+ */
+ do {
+ r = factor(l);
+ *p++ = r;
+ l /= r;
+ } while (l > 1);
+
+ bots_message("Computing FFT ");
+ #pragma omp parallel
+ #pragma omp single
+#if defined(FORCE_TIED_TASKS)
+ #pragma omp task
+#else
+ #pragma omp task untied
+#endif
+ fft_aux(n, in, out, factors, W, n);
+ bots_message(" completed!\n");
+
+ free(W);
+ return;
+}
+void fft_seq(int n, COMPLEX * in, COMPLEX * out)
+{
+ int factors[40]; /* allows FFTs up to at least 3^40 */
+ int *p = factors;
+ int l = n;
+ int r;
+ COMPLEX *W;
+
+ W = (COMPLEX *) malloc((n + 1) * sizeof(COMPLEX));
+ compute_w_coefficients_seq(n, 0, n / 2, W);
+
+ /*
+ * find factors of n, first 8, then 4 and then primes in ascending
+ * order
+ */
+ do {
+ r = factor(l);
+ *p++ = r;
+ l /= r;
+ } while (l > 1);
+
+ fft_aux_seq(n, in, out, factors, W, n);
+
+ free(W);
+ return;
+}
+int test_correctness(int n, COMPLEX *out1, COMPLEX *out2)
+{
+ int i;
+ double a,d,error = 0.0;
+
+ for (i = 0; i < n; ++i) {
+ a = sqrt((c_re(out1[i]) - c_re(out2[i])) *
+ (c_re(out1[i]) - c_re(out2[i])) +
+ (c_im(out1[i]) - c_im(out2[i])) *
+ (c_im(out1[i]) - c_im(out2[i])));
+ d = sqrt(c_re(out2[i]) * c_re(out2[i]) +
+ c_im(out2[i]) * c_im(out2[i]));
+ if (d < -1.0e-10 || d > 1.0e-10) a /= d;
+ if (a > error) error = a;
+ }
+ bots_message("relative error=%e\n", error);
+ if (error > 1e-3) return BOTS_RESULT_UNSUCCESSFUL;
+ else return BOTS_RESULT_SUCCESSFUL;
+}
+
diff --git a/src/components/implementation/no_interface/omp_fft_bots/fft.h b/src/components/implementation/no_interface/omp_fft_bots/fft.h
b/src/components/implementation/no_interface/omp_fft_bots/fft.h new file mode 100644 index 0000000000..ebafa9fb4d --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/fft.h @@ -0,0 +1,55 @@ +#ifndef FFT_H +#define FFT_H + +/* our real numbers */ +typedef double REAL; + +/* Complex numbers and operations */ +typedef struct { + REAL re, im; +} COMPLEX; + +#define c_re(c) ((c).re) +#define c_im(c) ((c).im) + +void compute_w_coefficients(int n, int a, int b, COMPLEX * W); +void compute_w_coefficients_seq(int n, int a, int b, COMPLEX * W); +int factor(int n); +void unshuffle(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m); +void unshuffle_seq(int a, int b, COMPLEX * in, COMPLEX * out, int r, int m); +void fft_twiddle_gen1(COMPLEX * in, COMPLEX * out, COMPLEX * W, int r, int m, int nW, int nWdnti, int nWdntm); +void fft_twiddle_gen(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m); +void fft_twiddle_gen_seq(int i, int i1, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int r, int m); +void fft_base_2(COMPLEX * in, COMPLEX * out); +void fft_twiddle_2(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_2(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_2_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_4(COMPLEX * in, COMPLEX * out); +void fft_twiddle_4(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_4(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_4_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_8(COMPLEX * in, COMPLEX * out); +void fft_twiddle_8(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_8(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_8_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_16(COMPLEX * in, COMPLEX * out); +void fft_twiddle_16(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_16(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_16_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_base_32(COMPLEX * in, COMPLEX * out); +void fft_twiddle_32(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_twiddle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, COMPLEX * W, int nW, int nWdn, int m); +void fft_unshuffle_32(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_unshuffle_32_seq(int a, int b, COMPLEX * in, COMPLEX * out, int m); +void fft_aux(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW); +void fft_aux_seq(int n, COMPLEX * in, COMPLEX * out, int *factors, COMPLEX * W, int nW); +void fft(int n, COMPLEX * in, COMPLEX * out); +void fft_seq(int n, COMPLEX * in, COMPLEX * out); +int test_correctness(int n, COMPLEX *out1, COMPLEX *out2); + +#endif + diff --git a/src/components/implementation/no_interface/omp_fft_bots/init.c 
b/src/components/implementation/no_interface/omp_fft_bots/init.c new file mode 120000 index 0000000000..9e09b82e77 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/init.c @@ -0,0 +1 @@ +../omp_fib_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_fft_bots/omp-tasks-app.h new file mode 120000 index 0000000000..9fba574408 --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/omp-tasks-app.h @@ -0,0 +1 @@ +../omp_fib_bots/omp-tasks-app.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fft_bots/posix_basic.c b/src/components/implementation/no_interface/omp_fft_bots/posix_basic.c new file mode 120000 index 0000000000..9afee078fb --- /dev/null +++ b/src/components/implementation/no_interface/omp_fft_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_fib_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots.h b/src/components/implementation/no_interface/omp_fib_bots/bots.h index add69e42ec..fee71a7eb2 100644 --- a/src/components/implementation/no_interface/omp_fib_bots/bots.h +++ b/src/components/implementation/no_interface/omp_fib_bots/bots.h @@ -83,7 +83,7 @@ extern bots_verbose_mode_t bots_verbose_mode; #define bots_message(msg, ...) \ {\ if ( bots_verbose_mode >= BOTS_VERBOSE_DEFAULT ) {\ - PRINTC(msg , ##__VA_ARGS__);\ + printc(msg , ##__VA_ARGS__);\ }\ } diff --git a/src/components/implementation/no_interface/omp_fib_bots/bots_main.c b/src/components/implementation/no_interface/omp_fib_bots/bots_main.c index 53b478512a..e70ca2fccb 100644 --- a/src/components/implementation/no_interface/omp_fib_bots/bots_main.c +++ b/src/components/implementation/no_interface/omp_fib_bots/bots_main.c @@ -303,10 +303,9 @@ bots_get_params_common(int argc, char **argv) #endif case 'c': /* set/unset check mode */ argv[i][1] = '*'; - //i++; - //if (argc == i) { bots_print_usage(); cos_exit(100); } - //bots_check_flag = atoi(argv[i]); - bots_check_flag = TRUE; + i++; + if (argc == i) { bots_print_usage(); cos_exit(100); } + bots_check_flag = atoi(argv[i]); break; case 'e': /* include execution message */ argv[i][1] = '*'; diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/Makefile b/src/components/implementation/no_interface/omp_sparselu_for_bots/Makefile new file mode 100644 index 0000000000..901901a2cb --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_sparselu_for_bots.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +CFLAGS += -DFORCE_TIED_TASKS + +OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/app-desc.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/app-desc.h new file mode 100644 index 0000000000..50e655cf0b --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/app-desc.h @@ -0,0 +1,56 @@ 
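Two build/run details from the hunks above are worth spelling out. First, with the reworked option parsing in bots_main.c, result checking is now requested as `-c 1` (the flag consumes an integer argument) rather than bare `-c`. Second, the Makefile selects at most one task-cutoff policy; the commented-out flags choose how task creation is throttled once the recursion gets deep. A hedged sketch of how IF_CUTOFF and MANUAL_CUTOFF differ in the kernels that honour them; walk(), work(), CUTOFF and MAX_DEPTH are placeholders, and FORCE_TIED_TASKS (enabled in this Makefile) simply drops the untied clauses:

void walk(int depth)
{
        if (depth == MAX_DEPTH) { work(); return; }

#if defined(IF_CUTOFF)
        /* The task construct is always reached; once depth >= CUTOFF the if()
         * clause is false, so the task is undeferred and runs immediately in
         * the generating thread. */
        #pragma omp task untied if (depth < CUTOFF)
        walk(depth + 1);
        #pragma omp taskwait
#elif defined(MANUAL_CUTOFF)
        if (depth < CUTOFF) {
                #pragma omp task untied
                walk(depth + 1);
                #pragma omp taskwait
        } else {
                walk(depth + 1);    /* past the cutoff: no tasking overhead at all */
        }
#else
        #pragma omp task untied     /* unconditional task creation */
        walk(depth + 1);
        #pragma omp taskwait
#endif
}

FINAL_CUTOFF, also listed but commented out, would typically rely on the OpenMP final clause instead, which forces every descendant of a final task to become a final, included task.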
+/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include "omp-tasks-app.h" + +#define BOTS_APP_NAME "SparseLU (For version)" +#define BOTS_APP_PARAMETERS_DESC "S1=%dx%d, S2=%dx%d" +#define BOTS_APP_PARAMETERS_LIST ,bots_arg_size,bots_arg_size,bots_arg_size_1,bots_arg_size_1 + +#define BOTS_APP_USES_ARG_SIZE +#define BOTS_APP_DEF_ARG_SIZE 50 +#define BOTS_APP_DESC_ARG_SIZE "Matrix Size" + +#define BOTS_APP_USES_ARG_SIZE_1 +#define BOTS_APP_DEF_ARG_SIZE_1 100 +#define BOTS_APP_DESC_ARG_SIZE_1 "Submatrix Size" + +#define BOTS_APP_INIT float **SEQ,**BENCH; + +void sparselu_init(float ***pM, char *pass); +void sparselu_fini(float **M, char *pass); +void sparselu_seq_call(float **SEQ); +void sparselu_par_call(float **BENCH); +int sparselu_check(float **SEQ, float **BENCH); + +#define KERNEL_INIT sparselu_init(&BENCH,"benchmark"); +#define KERNEL_CALL sparselu_par_call(BENCH); +#define KERNEL_FINI sparselu_fini(BENCH,"benchmark"); + +#define KERNEL_SEQ_INIT sparselu_init(&SEQ,"serial"); +#define KERNEL_SEQ_CALL sparselu_seq_call(SEQ); +#define KERNEL_SEQ_FINI sparselu_fini(SEQ,"serial"); + +/* + * Phani: start without sequencial test + */ +#undef BOTS_APP_CHECK_USES_SEQ_RESULT +#define KERNEL_CHECK sparselu_check(SEQ,BENCH); + diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots.h new file mode 120000 index 0000000000..828039f356 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.c new file mode 120000 index 0000000000..8517c18eeb --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_common.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.h new file mode 120000 index 0000000000..7eb55ec523 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_common.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_common.h \ 
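app-desc.h above is pure glue: bots_main.c never calls the SparseLU routines directly, it only expands the BOTS_APP_*/KERNEL_* macros around its timing and reporting code. A heavily simplified, hypothetical rendering of that expansion (run_benchmark() is an editorial stand-in; the real driver also parses arguments and measures time; bots_check_flag and BOTS_RESULT_NA come from the shared BOTS infrastructure):

/* Editorial sketch of how the driver consumes these macros (simplified). */
#include "bots.h"
#include "app-desc.h"

int run_benchmark(void)
{
        int rc = BOTS_RESULT_NA;

        BOTS_APP_INIT           /* float **SEQ,**BENCH;                           */

        KERNEL_INIT             /* sparselu_init(&BENCH, "benchmark")             */
        KERNEL_CALL             /* sparselu_par_call(BENCH), the timed region     */
        KERNEL_FINI             /* sparselu_fini(BENCH, "benchmark")              */

        if (bots_check_flag) {
#ifdef BOTS_APP_CHECK_USES_SEQ_RESULT   /* undefined above, so the serial run is skipped */
                KERNEL_SEQ_INIT
                KERNEL_SEQ_CALL
                KERNEL_SEQ_FINI
#endif
                rc = KERNEL_CHECK       /* sparselu_check(SEQ, BENCH); macro supplies the ';' */
        }
        return rc;
}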
No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.c new file mode 120000 index 0000000000..29ad202b50 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_main.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.h new file mode 120000 index 0000000000..2d1387edd5 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/bots_main.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/bots_main.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/init.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/init.c new file mode 120000 index 0000000000..a7a03a9e37 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/init.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/omp-tasks-app.h b/src/components/implementation/no_interface/omp_sparselu_for_bots/omp-tasks-app.h new file mode 120000 index 0000000000..1c1cf79526 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/omp-tasks-app.h @@ -0,0 +1 @@ +../omp_sparselu_single_bots/omp-tasks-app.h \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/posix_basic.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/posix_basic.c new file mode 120000 index 0000000000..0b1896b27e --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/posix_basic.c @@ -0,0 +1 @@ +../omp_sparselu_single_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.c b/src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.c new file mode 100644 index 0000000000..b441389dc9 --- /dev/null +++ b/src/components/implementation/no_interface/omp_sparselu_for_bots/sparselu.c @@ -0,0 +1,326 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. 
*/ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "bots.h" +#include "sparselu.h" + +/*********************************************************************** + * checkmat: + **********************************************************************/ +int checkmat (float *M, float *N) +{ + int i, j; + float r_err; + + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + r_err = M[i*bots_arg_size_1+j] - N[i*bots_arg_size_1+j]; + if ( r_err == 0.0 ) continue; + + if (r_err < 0.0 ) r_err = -r_err; + + if ( M[i*bots_arg_size_1+j] == 0 ) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; \n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j]); + return FALSE; + } + r_err = r_err / M[i*bots_arg_size_1+j]; + if(r_err > EPSILON) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; Relative Error=%f\n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j], r_err); + return FALSE; + } + } + } + return TRUE; +} +/*********************************************************************** + * genmat: + **********************************************************************/ +void genmat (float *M[]) +{ + int null_entry, init_val, i, j, ii, jj; + float *p; + int a=0,b=0; + + init_val = 1325; + + /* generating the structure */ + for (ii=0; ii < bots_arg_size; ii++) + { + for (jj=0; jj < bots_arg_size; jj++) + { + /* computing null entries */ + null_entry=FALSE; + if ((iijj) && (jj%3 !=0)) null_entry = TRUE; + if (ii%2==1) null_entry = TRUE; + if (jj%2==1) null_entry = TRUE; + if (ii==jj) null_entry = FALSE; + if (ii==jj-1) null_entry = FALSE; + if (ii-1 == jj) null_entry = FALSE; + /* allocating matrix */ + if (null_entry == FALSE){ + a++; + M[ii*bots_arg_size+jj] = (float *) malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + if ((M[ii*bots_arg_size+jj] == NULL)) + { + bots_message("Error: Out of memory\n"); + exit(101); + } + /* initializing matrix */ + p = M[ii*bots_arg_size+jj]; + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + init_val = (3125 * init_val) % 65536; + (*p) = (float)((init_val - 32768.0) / 16384.0); + p++; + } + } + } + else + { + b++; + M[ii*bots_arg_size+jj] = NULL; + } + } + } + bots_debug("allo = %d, no = %d, total = %d, factor = %f\n",a,b,a+b,(float)((float)a/(float)(a+b))); +} +/*********************************************************************** + * print_structure: + **********************************************************************/ +void print_structure(char *name, float *M[]) +{ + int ii, jj; + bots_message("Structure for matrix %s @ 0x%p\n",name, M); + for (ii = 0; ii < bots_arg_size; ii++) { + for (jj = 0; jj < bots_arg_size; jj++) { + if (M[ii*bots_arg_size+jj]!=NULL) {bots_message("x");} + else bots_message(" "); + } + bots_message("\n"); + } + bots_message("\n"); +} +/*********************************************************************** + * allocate_clean_block: + **********************************************************************/ +float * allocate_clean_block() +{ + int i,j; + float *p, *q; + + p = (float *) malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + q=p; + if (p!=NULL){ + for (i = 0; 
i < bots_arg_size_1; i++) + for (j = 0; j < bots_arg_size_1; j++){(*p)=0.0; p++;} + + } + else + { + bots_message("Error: Out of memory\n"); + exit (101); + } + return (q); +} + +/*********************************************************************** + * lu0: + **********************************************************************/ +void lu0(float *diag) +{ + int i, j, k; + + for (k=0; k +#include +#include +#include +#include +#include +#include "bots.h" +#include "sparselu.h" + +/*********************************************************************** + * checkmat: + **********************************************************************/ +int checkmat (float *M, float *N) +{ + int i, j; + float r_err; + + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + r_err = M[i*bots_arg_size_1+j] - N[i*bots_arg_size_1+j]; + if ( r_err == 0.0 ) continue; + + if (r_err < 0.0 ) r_err = -r_err; + + if ( M[i*bots_arg_size_1+j] == 0 ) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; \n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j]); + return FALSE; + } + r_err = r_err / M[i*bots_arg_size_1+j]; + if(r_err > EPSILON) + { + bots_message("Checking failure: A[%d][%d]=%f B[%d][%d]=%f; Relative Error=%f\n", + i,j, M[i*bots_arg_size_1+j], i,j, N[i*bots_arg_size_1+j], r_err); + return FALSE; + } + } + } + return TRUE; +} +/*********************************************************************** + * genmat: + **********************************************************************/ +void genmat (float *M[]) +{ + int null_entry, init_val, i, j, ii, jj; + float *p; + + init_val = 1325; + + /* generating the structure */ + for (ii=0; ii < bots_arg_size; ii++) + { + for (jj=0; jj < bots_arg_size; jj++) + { + /* computing null entries */ + null_entry=FALSE; + if ((iijj) && (jj%3 !=0)) null_entry = TRUE; + if (ii%2==1) null_entry = TRUE; + if (jj%2==1) null_entry = TRUE; + if (ii==jj) null_entry = FALSE; + if (ii==jj-1) null_entry = FALSE; + if (ii-1 == jj) null_entry = FALSE; + /* allocating matrix */ + if (null_entry == FALSE){ + M[ii*bots_arg_size+jj] = (float *) malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + if ((M[ii*bots_arg_size+jj] == NULL)) + { + bots_message("Error: Out of memory\n"); + exit(101); + } + /* initializing matrix */ + p = M[ii*bots_arg_size+jj]; + for (i = 0; i < bots_arg_size_1; i++) + { + for (j = 0; j < bots_arg_size_1; j++) + { + init_val = (3125 * init_val) % 65536; + (*p) = (float)((init_val - 32768.0) / 16384.0); + p++; + } + } + } + else + { + M[ii*bots_arg_size+jj] = NULL; + } + } + } +} +/*********************************************************************** + * print_structure: + **********************************************************************/ +void print_structure(char *name, float *M[]) +{ + int ii, jj; + bots_message("Structure for matrix %s @ 0x%p\n",name, M); + for (ii = 0; ii < bots_arg_size; ii++) { + for (jj = 0; jj < bots_arg_size; jj++) { + if (M[ii*bots_arg_size+jj]!=NULL) {bots_message("x");} + else bots_message(" "); + } + bots_message("\n"); + } + bots_message("\n"); +} +/*********************************************************************** + * allocate_clean_block: + **********************************************************************/ +float * allocate_clean_block() +{ + int i,j; + float *p, *q; + + p = (float *) malloc(bots_arg_size_1*bots_arg_size_1*sizeof(float)); + q=p; + if (p!=NULL){ + for (i = 0; i < bots_arg_size_1; i++) + for (j = 0; j < bots_arg_size_1; j++){(*p)=0.0; p++;} + 
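One note on genmat() above: the block-structure predicate in these hunks was damaged during extraction (the `<`/`>` comparisons were swallowed, leaving `(iijj)`), so the condition as printed is not what the code compiles. A sketch of the predicate as it appears in stock BOTS, pulled out into a helper for readability; this is an assumed reconstruction, and TRUE/FALSE are taken from bots_common.h:

/* Assumed reconstruction of the sparsity test used by genmat(); a block
 * (ii,jj) is allocated only when this returns FALSE. */
static int block_is_null(int ii, int jj)
{
        int null_entry = FALSE;

        if ((ii < jj) && (ii % 3 != 0)) null_entry = TRUE;
        if ((ii > jj) && (jj % 3 != 0)) null_entry = TRUE;
        if (ii % 2 == 1)                null_entry = TRUE;
        if (jj % 2 == 1)                null_entry = TRUE;
        if (ii == jj)                   null_entry = FALSE;  /* keep the diagonal              */
        if (ii == jj - 1)               null_entry = FALSE;  /* ...and both first off-diagonals */
        if (ii - 1 == jj)               null_entry = FALSE;
        return null_entry;
}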
+ } + else + { + bots_message("Error: Out of memory\n"); + exit (101); + } + return (q); +} + +/*********************************************************************** + * lu0: + **********************************************************************/ +void lu0(float *diag) +{ + int i, j, k; + + for (k=0; k +#include +#include +#include "app-desc.h" +#include "bots.h" +#include "strassen.h" + +/*********************************************************************** + * Naive sequential algorithm, for comparison purposes + **********************************************************************/ +void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn) +{ + int i, j, k; + REAL s; + + for (i = 0; i < n; ++i) + { + for (j = 0; j < n; ++j) + { + s = 0.0; + for (k = 0; k < n; ++k) s += ELEM(A, an, i, k) * ELEM(B, bn, k, j); + ELEM(C, cn, i, j) = s; + } + } +} +/***************************************************************************** +** +** FastNaiveMatrixMultiply +** +** For small to medium sized matrices A, B, and C of size +** MatrixSize * MatrixSize this function performs the operation +** C = A x B efficiently. +** +** Note MatrixSize must be divisible by 8. +** +** INPUT: +** C = (*C WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** +** OUTPUT: +** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) 
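matrixmul() above is the plain O(n^3) reference; the routines that follow (FastNaiveMatrixMultiply and friends) are the unrolled and recursive kernels the benchmark actually exercises. A small editorial sketch of how the two can be cross-checked on a toy case; init_matrix() and compare_matrices() are hypothetical helpers, REAL is the type from strassen.h, and 16 is chosen only because the fast kernel needs a multiple of 8:

static void strassen_kernel_selftest(void)
{
        enum { N = 16 };   /* FastNaiveMatrixMultiply requires a multiple of 8 */
        static REAL A[N * N], B[N * N], C_ref[N * N], C_fast[N * N];

        init_matrix(N, A, N);                               /* hypothetical: fill with test data */
        init_matrix(N, B, N);

        matrixmul(N, A, N, B, N, C_ref, N);                 /* naive O(n^3) reference            */
        FastNaiveMatrixMultiply(C_fast, A, B, N, N, N, N);  /* unrolled 8-wide kernel below      */

        compare_matrices(N, C_ref, C_fast);                 /* hypothetical tolerance check      */
}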
+** +*****************************************************************************/ +void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) +{ + /* Assumes size of real is 8 bytes */ + PTR RowWidthBInBytes = RowWidthB << 3; + PTR RowWidthAInBytes = RowWidthA << 3; + PTR MatrixWidthInBytes = MatrixSize << 3; + PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; + unsigned Horizontal, Vertical; + + REAL *ARowStart = A; + for (Vertical = 0; Vertical < MatrixSize; Vertical++) { + for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { + REAL *BColumnStart = B + Horizontal; + REAL FirstARowValue = *ARowStart++; + + REAL Sum0 = FirstARowValue * (*BColumnStart); + REAL Sum1 = FirstARowValue * (*(BColumnStart+1)); + REAL Sum2 = FirstARowValue * (*(BColumnStart+2)); + REAL Sum3 = FirstARowValue * (*(BColumnStart+3)); + REAL Sum4 = FirstARowValue * (*(BColumnStart+4)); + REAL Sum5 = FirstARowValue * (*(BColumnStart+5)); + REAL Sum6 = FirstARowValue * (*(BColumnStart+6)); + REAL Sum7 = FirstARowValue * (*(BColumnStart+7)); + + unsigned Products; + for (Products = 1; Products < MatrixSize; Products++) { + REAL ARowValue = *ARowStart++; + BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); + Sum0 += ARowValue * (*BColumnStart); + Sum1 += ARowValue * (*(BColumnStart+1)); + Sum2 += ARowValue * (*(BColumnStart+2)); + Sum3 += ARowValue * (*(BColumnStart+3)); + Sum4 += ARowValue * (*(BColumnStart+4)); + Sum5 += ARowValue * (*(BColumnStart+5)); + Sum6 += ARowValue * (*(BColumnStart+6)); + Sum7 += ARowValue * (*(BColumnStart+7)); + } + ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); + + *(C) = Sum0; + *(C+1) = Sum1; + *(C+2) = Sum2; + *(C+3) = Sum3; + *(C+4) = Sum4; + *(C+5) = Sum5; + *(C+6) = Sum6; + *(C+7) = Sum7; + C+=8; + } + ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); + C = (REAL*) ( ((PTR) C) + RowIncrementC ); + } +} +/***************************************************************************** +** +** FastAdditiveNaiveMatrixMultiply +** +** For small to medium sized matrices A, B, and C of size +** MatrixSize * MatrixSize this function performs the operation +** C += A x B efficiently. +** +** Note MatrixSize must be divisible by 8. +** +** INPUT: +** C = (*C READ/WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** +** OUTPUT: +** C = (*C READ/WRITE) Matrix C contains C + A x B. 
+** +*****************************************************************************/ +void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB) +{ + /* Assumes size of real is 8 bytes */ + PTR RowWidthBInBytes = RowWidthB << 3; + PTR RowWidthAInBytes = RowWidthA << 3; + PTR MatrixWidthInBytes = MatrixSize << 3; + PTR RowIncrementC = ( RowWidthC - MatrixSize) << 3; + unsigned Horizontal, Vertical; + + REAL *ARowStart = A; + for (Vertical = 0; Vertical < MatrixSize; Vertical++) { + for (Horizontal = 0; Horizontal < MatrixSize; Horizontal += 8) { + REAL *BColumnStart = B + Horizontal; + + REAL Sum0 = *C; + REAL Sum1 = *(C+1); + REAL Sum2 = *(C+2); + REAL Sum3 = *(C+3); + REAL Sum4 = *(C+4); + REAL Sum5 = *(C+5); + REAL Sum6 = *(C+6); + REAL Sum7 = *(C+7); + + unsigned Products; + for (Products = 0; Products < MatrixSize; Products++) { + REAL ARowValue = *ARowStart++; + + Sum0 += ARowValue * (*BColumnStart); + Sum1 += ARowValue * (*(BColumnStart+1)); + Sum2 += ARowValue * (*(BColumnStart+2)); + Sum3 += ARowValue * (*(BColumnStart+3)); + Sum4 += ARowValue * (*(BColumnStart+4)); + Sum5 += ARowValue * (*(BColumnStart+5)); + Sum6 += ARowValue * (*(BColumnStart+6)); + Sum7 += ARowValue * (*(BColumnStart+7)); + + BColumnStart = (REAL*) (((PTR) BColumnStart) + RowWidthBInBytes); + + } + ARowStart = (REAL*) ( ((PTR) ARowStart) - MatrixWidthInBytes); + + *(C) = Sum0; + *(C+1) = Sum1; + *(C+2) = Sum2; + *(C+3) = Sum3; + *(C+4) = Sum4; + *(C+5) = Sum5; + *(C+6) = Sum6; + *(C+7) = Sum7; + C+=8; + } + + ARowStart = (REAL*) ( ((PTR) ARowStart) + RowWidthAInBytes ); + C = (REAL*) ( ((PTR) C) + RowIncrementC ); + } +} +/***************************************************************************** +** +** MultiplyByDivideAndConquer +** +** For medium to medium-large (would you like fries with that) sized +** matrices A, B, and C of size MatrixSize * MatrixSize this function +** efficiently performs the operation +** C = A x B (if AdditiveMode == 0) +** C += A x B (if AdditiveMode != 0) +** +** Note MatrixSize must be divisible by 16. +** +** INPUT: +** C = (*C READ/WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. +** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** AdditiveMode = 0 if we want C = A x B, otherwise we'll do C += A x B +** +** OUTPUT: +** C (+)= A x B. 
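For reference, the quadrant recursion in the routine below is the ordinary 2x2 block product: the first four recursive calls write each quadrant of C, and the second four (with AdditiveMode forced to 1) accumulate the remaining term into it. Written out with the 0-based quadrant names used in the code:

/*
 *   C00 = A00*B00 + A01*B10        C01 = A00*B01 + A01*B11
 *   C10 = A10*B00 + A11*B10        C11 = A10*B01 + A11*B11
 *
 * Once QuadrantSize drops to SizeAtWhichNaiveAlgorithmIsMoreEfficient, the same
 * eight products are issued directly to the Fast(Additive)NaiveMatrixMultiply kernels.
 */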
(+ if AdditiveMode != 0) +** +*****************************************************************************/ +void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B, + unsigned MatrixSize, + unsigned RowWidthC, + unsigned RowWidthA, + unsigned RowWidthB, + int AdditiveMode + ) +{ + #define A00 A + #define B00 B + #define C00 C + REAL *A01, *A10, *A11, *B01, *B10, *B11, *C01, *C10, *C11; + unsigned QuadrantSize = MatrixSize >> 1; + + /* partition the matrix */ + A01 = A00 + QuadrantSize; + A10 = A00 + RowWidthA * QuadrantSize; + A11 = A10 + QuadrantSize; + + B01 = B00 + QuadrantSize; + B10 = B00 + RowWidthB * QuadrantSize; + B11 = B10 + QuadrantSize; + + C01 = C00 + QuadrantSize; + C10 = C00 + RowWidthC * QuadrantSize; + C11 = C10 + QuadrantSize; + + if (QuadrantSize > SizeAtWhichNaiveAlgorithmIsMoreEfficient) { + + MultiplyByDivideAndConquer(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C11, A10, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + AdditiveMode); + + MultiplyByDivideAndConquer(C00, A01, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + MultiplyByDivideAndConquer(C01, A01, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + MultiplyByDivideAndConquer(C11, A11, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + MultiplyByDivideAndConquer(C10, A11, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB, + 1); + + } else { + + if (AdditiveMode) { + FastAdditiveNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + } else { + + FastNaiveMatrixMultiply(C00, A00, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C01, A00, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C11, A10, B01, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastNaiveMatrixMultiply(C10, A10, B00, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + } + + FastAdditiveNaiveMatrixMultiply(C00, A01, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C01, A01, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C11, A11, B11, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + + FastAdditiveNaiveMatrixMultiply(C10, A11, B10, QuadrantSize, + RowWidthC, RowWidthA, RowWidthB); + } + return; +} +/***************************************************************************** +** +** OptimizedStrassenMultiply +** +** For large matrices A, B, and C of size MatrixSize * MatrixSize this +** function performs the operation C = A x B efficiently. +** +** INPUT: +** C = (*C WRITE) Address of top left element of matrix C. +** A = (*A IS READ ONLY) Address of top left element of matrix A. +** B = (*B IS READ ONLY) Address of top left element of matrix B. 
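Because the combining pass at the end of OptimizedStrassenMultiply_seq is written four elements at a time, it is easy to lose sight of what it computes. Collecting the step comments of the function below into one place, as an editorial summary using the same S/M/T names, the routine is the Winograd form of Strassen's algorithm with seven half-size multiplications:

/*
 *   S1 = A21 + A22          S5 = B12 - B11
 *   S2 = S1  - A11          S6 = B22 - S5
 *   S3 = A11 - A21          S7 = B22 - B12
 *   S4 = A12 - S2           S8 = S6  - B21
 *
 *   M2      = A11 x B11     M5 = S1 x S5     T1sMULT = S2 x S6
 *   (the remaining products A12 x B21, S4 x B22, S3 x S7 and A22 x S8 are
 *    written directly into C11, C12, C22 and C21 respectively)
 *
 *   T1  = T1sMULT + M2               T2  = (S3 x S7) + T1
 *   C11 = M2 + (A12 x B21)           C12 = (S4 x B22) + T1 + M5
 *   C22 = M5 + T2                    C21 = T2 - (A22 x S8)
 *
 * The second Row/Column loop applies exactly these combinations, four elements
 * at a time.
 */

The parallel variants further down spawn one task per half-size product and meet at the taskwait before running this same combining pass.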
+** MatrixSize = Size of matrices (for n*n matrix, MatrixSize = n) +** RowWidthA = Number of elements in memory between A[x,y] and A[x,y+1] +** RowWidthB = Number of elements in memory between B[x,y] and B[x,y+1] +** RowWidthC = Number of elements in memory between C[x,y] and C[x,y+1] +** +** OUTPUT: +** C = (*C WRITE) Matrix C contains A x B. (Initial value of *C undefined.) +** +*****************************************************************************/ +void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step 
through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + /* M2 = A11 x B11 */ + OptimizedStrassenMultiply_seq(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + OptimizedStrassenMultiply_seq(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + OptimizedStrassenMultiply_seq(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + OptimizedStrassenMultiply_seq(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + OptimizedStrassenMultiply_seq(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + OptimizedStrassenMultiply_seq(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + OptimizedStrassenMultiply_seq(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) 
+ LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#if defined(IF_CUTOFF) +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += 
QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + +#if defined(FORCE_TIED_TASKS) + /* M2 = A11 x B11 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#else + /* M2 = A11 x B11 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, 
RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied if (Depth < bots_cutoff_value) + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#endif + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#elif defined(MANUAL_CUTOFF) +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned 
MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + ** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + 
***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + + if (Depth < bots_cutoff_value) + { +#if defined(FORCE_TIED_TASKS) + /* M2 = A11 x B11 */ + #pragma omp task + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#else + /* M2 = A11 x B11 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#endif + + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp 
taskwait + } + else + { + /* M2 = A11 x B11 */ + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + /* M5 = S1 * S5 */ + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of T1 = S2 x S6 + M2 */ + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of T2 = T1 + S3 x S7 */ + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + /* Step 1 of C11 = M2 + A12 * B21 */ + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + /* Step 1 of C21 = T2 - A22 * S8 */ + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); + } + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#else +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth) +{ + unsigned QuadrantSize = MatrixSize >> 1; /* MatixSize / 2 */ + unsigned QuadrantSizeInBytes = sizeof(REAL) * QuadrantSize * QuadrantSize + + 32; + unsigned Column, Row; + + /************************************************************************ + ** For each matrix A, B, and C, we'll want pointers to each quandrant + 
** in the matrix. These quandrants will be addressed as follows: + ** -- -- + ** | A11 A12 | + ** | | + ** | A21 A22 | + ** -- -- + ************************************************************************/ + REAL /* *A11, *B11, *C11, */ *A12, *B12, *C12, + *A21, *B21, *C21, *A22, *B22, *C22; + + REAL *S1,*S2,*S3,*S4,*S5,*S6,*S7,*S8,*M2,*M5,*T1sMULT; + #define T2sMULT C22 + #define NumberOfVariables 11 + + PTR TempMatrixOffset = 0; + PTR MatrixOffsetA = 0; + PTR MatrixOffsetB = 0; + + char *Heap; + void *StartHeap; + + /* Distance between the end of a matrix row and the start of the next row */ + PTR RowIncrementA = ( RowWidthA - QuadrantSize ) << 3; + PTR RowIncrementB = ( RowWidthB - QuadrantSize ) << 3; + PTR RowIncrementC = ( RowWidthC - QuadrantSize ) << 3; + + if (MatrixSize <= bots_app_cutoff_value) { + MultiplyByDivideAndConquer(C, A, B, MatrixSize, RowWidthC, RowWidthA, RowWidthB, 0); + return; + } + + /* Initialize quandrant matrices */ + #define A11 A + #define B11 B + #define C11 C + A12 = A11 + QuadrantSize; + B12 = B11 + QuadrantSize; + C12 = C11 + QuadrantSize; + A21 = A + (RowWidthA * QuadrantSize); + B21 = B + (RowWidthB * QuadrantSize); + C21 = C + (RowWidthC * QuadrantSize); + A22 = A21 + QuadrantSize; + B22 = B21 + QuadrantSize; + C22 = C21 + QuadrantSize; + + /* Allocate Heap Space Here */ + StartHeap = Heap = malloc(QuadrantSizeInBytes * NumberOfVariables); + /* ensure that heap is on cache boundary */ + if ( ((PTR) Heap) & 31) + Heap = (char*) ( ((PTR) Heap) + 32 - ( ((PTR) Heap) & 31) ); + + /* Distribute the heap space over the variables */ + S1 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S3 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S4 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S6 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S7 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + S8 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M2 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + M5 = (REAL*) Heap; Heap += QuadrantSizeInBytes; + T1sMULT = (REAL*) Heap; Heap += QuadrantSizeInBytes; + + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + + /************************************************************************* + ** Step through each row horizontally (addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column++) { + + /*********************************************************** + ** Within this loop, the following holds for MatrixOffset: + ** MatrixOffset = (Row * RowWidth) + Column + ** (note: that the unit of the offset is number of reals) + ***********************************************************/ + /* Element of Global Matrix, such as A, B, C */ + #define E(Matrix) (* (REAL*) ( ((PTR) Matrix) + TempMatrixOffset ) ) + #define EA(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetA ) ) + #define EB(Matrix) (* (REAL*) ( ((PTR) Matrix) + MatrixOffsetB ) ) + + /* FIXME - may pay to expand these out - got higher speed-ups below */ + /* S4 = A12 - ( S2 = ( S1 = A21 + A22 ) - A11 ) */ + E(S4) = 
EA(A12) - ( E(S2) = ( E(S1) = EA(A21) + EA(A22) ) - EA(A11) ); + + /* S8 = (S6 = B22 - ( S5 = B12 - B11 ) ) - B21 */ + E(S8) = ( E(S6) = EB(B22) - ( E(S5) = EB(B12) - EB(B11) ) ) - EB(B21); + + /* S3 = A11 - A21 */ + E(S3) = EA(A11) - EA(A21); + + /* S7 = B22 - B12 */ + E(S7) = EB(B22) - EB(B12); + + TempMatrixOffset += sizeof(REAL); + MatrixOffsetA += sizeof(REAL); + MatrixOffsetB += sizeof(REAL); + } /* end row loop*/ + + MatrixOffsetA += RowIncrementA; + MatrixOffsetB += RowIncrementB; + } /* end column loop */ + +#if defined(FORCE_TIED_TASKS) + /* M2 = A11 x B11 */ + #pragma omp task + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#else + /* M2 = A11 x B11 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M2, A11, B11, QuadrantSize, QuadrantSize, RowWidthA, RowWidthB, Depth+1); + + /* M5 = S1 * S5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(M5, S1, S5, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T1 = S2 x S6 + M2 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(T1sMULT, S2, S6, QuadrantSize, QuadrantSize, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of T2 = T1 + S3 x S7 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C22, S3, S7, QuadrantSize, RowWidthC /*FIXME*/, QuadrantSize, QuadrantSize, Depth+1); + + /* Step 1 of C11 = M2 + A12 * B21 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C11, A12, B21, QuadrantSize, RowWidthC, RowWidthA, RowWidthB, Depth+1); + + /* Step 1 of C12 = S4 x B22 + T1 + M5 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C12, S4, B22, QuadrantSize, RowWidthC, QuadrantSize, RowWidthB, Depth+1); + + /* Step 1 of C21 = T2 - A22 * S8 */ + #pragma omp task untied + OptimizedStrassenMultiply_par(C21, A22, S8, QuadrantSize, RowWidthC, RowWidthA, QuadrantSize, Depth+1); +#endif + + /********************************************** + ** Synchronization Point + **********************************************/ + #pragma omp taskwait + /*************************************************************************** + ** Step through all columns row by row (vertically) + ** (jumps in memory by RowWidth => bad locality) + ** (but we want the best locality on the innermost loop) + ***************************************************************************/ + for (Row = 0; Row < QuadrantSize; Row++) { + /************************************************************************* + ** Step through each row horizontally 
(addressing elements in each column) + ** (jumps linearly througn memory => good locality) + *************************************************************************/ + for (Column = 0; Column < QuadrantSize; Column += 4) { + REAL LocalM5_0 = *(M5); + REAL LocalM5_1 = *(M5+1); + REAL LocalM5_2 = *(M5+2); + REAL LocalM5_3 = *(M5+3); + REAL LocalM2_0 = *(M2); + REAL LocalM2_1 = *(M2+1); + REAL LocalM2_2 = *(M2+2); + REAL LocalM2_3 = *(M2+3); + REAL T1_0 = *(T1sMULT) + LocalM2_0; + REAL T1_1 = *(T1sMULT+1) + LocalM2_1; + REAL T1_2 = *(T1sMULT+2) + LocalM2_2; + REAL T1_3 = *(T1sMULT+3) + LocalM2_3; + REAL T2_0 = *(C22) + T1_0; + REAL T2_1 = *(C22+1) + T1_1; + REAL T2_2 = *(C22+2) + T1_2; + REAL T2_3 = *(C22+3) + T1_3; + (*(C11)) += LocalM2_0; + (*(C11+1)) += LocalM2_1; + (*(C11+2)) += LocalM2_2; + (*(C11+3)) += LocalM2_3; + (*(C12)) += LocalM5_0 + T1_0; + (*(C12+1)) += LocalM5_1 + T1_1; + (*(C12+2)) += LocalM5_2 + T1_2; + (*(C12+3)) += LocalM5_3 + T1_3; + (*(C22)) = LocalM5_0 + T2_0; + (*(C22+1)) = LocalM5_1 + T2_1; + (*(C22+2)) = LocalM5_2 + T2_2; + (*(C22+3)) = LocalM5_3 + T2_3; + (*(C21 )) = (- *(C21 )) + T2_0; + (*(C21+1)) = (- *(C21+1)) + T2_1; + (*(C21+2)) = (- *(C21+2)) + T2_2; + (*(C21+3)) = (- *(C21+3)) + T2_3; + M5 += 4; + M2 += 4; + T1sMULT += 4; + C11 += 4; + C12 += 4; + C21 += 4; + C22 += 4; + } + C11 = (REAL*) ( ((PTR) C11 ) + RowIncrementC); + C12 = (REAL*) ( ((PTR) C12 ) + RowIncrementC); + C21 = (REAL*) ( ((PTR) C21 ) + RowIncrementC); + C22 = (REAL*) ( ((PTR) C22 ) + RowIncrementC); + } + free(StartHeap); +} +#endif +/* + * Set an n by n matrix A to random values. The distance between + * rows is an + */ +void init_matrix(int n, REAL *A, int an) +{ + int i, j; + + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) + ELEM(A, an, i, j) = ((double) rand()) / (double) RAND_MAX; +} + +/* + * Compare two matrices. Print an error message if they differ by + * more than EPSILON. 
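+ * The check below is on the relative error: an element pair is rejected
+ * when |A[i,j] - B[i,j]| / A[i,j] exceeds EPSILON.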
+ */ +int compare_matrix(int n, REAL *A, int an, REAL *B, int bn) +{ + int i, j; + REAL c; + + for (i = 0; i < n; ++i) + for (j = 0; j < n; ++j) { + /* compute the relative error c */ + c = ELEM(A, an, i, j) - ELEM(B, bn, i, j); + if (c < 0.0) + c = -c; + + c = c / ELEM(A, an, i, j); + if (c > EPSILON) { + bots_message("Strassen: Wrong answer!\n"); + return BOTS_RESULT_UNSUCCESSFUL; + } + } + + return BOTS_RESULT_SUCCESSFUL; +} + +/* + * Allocate a matrix of side n (therefore n^2 elements) + */ +REAL *alloc_matrix(int n) +{ + return malloc(n * n * sizeof(REAL)); +} + +void strassen_main_par(REAL *A, REAL *B, REAL *C, int n) +{ + bots_message("Computing parallel Strassen algorithm (n=%d) ", n); + #pragma omp parallel + #pragma omp single +#if defined(FORCE_TIED_TASKS) + #pragma omp task +#else + #pragma omp task untied +#endif + OptimizedStrassenMultiply_par(C, A, B, n, n, n, n, 1); + bots_message(" completed!\n"); +} +void strassen_main_seq(REAL *A, REAL *B, REAL *C, int n) +{ + bots_message("Computing sequential Strassen algorithm (n=%d) ", n); + OptimizedStrassenMultiply_seq(C, A, B, n, n, n, n, 1); + bots_message(" completed!\n"); +} + diff --git a/src/components/implementation/no_interface/omp_strassen_bots/strassen.h b/src/components/implementation/no_interface/omp_strassen_bots/strassen.h new file mode 100644 index 0000000000..7944f77880 --- /dev/null +++ b/src/components/implementation/no_interface/omp_strassen_bots/strassen.h @@ -0,0 +1,66 @@ +/**********************************************************************************************/ +/* This program is part of the Barcelona OpenMP Tasks Suite */ +/* Copyright (C) 2009 Barcelona Supercomputing Center - Centro Nacional de Supercomputacion */ +/* Copyright (C) 2009 Universitat Politecnica de Catalunya */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +/**********************************************************************************************/ +#ifndef _STRASSEN_H +#define _STRASSEN_H +/* ******************************************************************* */ +/* STRASSEN APPLICATION CUT OFF's */ +/* ******************************************************************* */ +/* Strassen uses three different functions to compute Matrix Multiply. 
*/ +/* Each of them is related to an application cut off value: */ +/* - Initial algorithm: OptimizedStrassenMultiply() */ +/* - bots_app_cutoff_value: MultiplyByDivideAndConquer() */ +/* - SizeAtWhichNaiveAlgorithmIsMoreEfficient: FastAdditiveNaiveMatrixMultiply() */ +/* ******************************************************************* */ + +/*FIXME: at the moment we use a constant value, change to parameter ???*/ +/* Below this cut off strassen uses FastAdditiveNaiveMatrixMultiply algorithm */ +#define SizeAtWhichNaiveAlgorithmIsMoreEfficient 16 + +/*********************************************************************** + * maximum tolerable relative error (for the checking routine) + **********************************************************************/ +#define EPSILON (1.0E-6) +/*********************************************************************** + * Matrices are stored in row-major order; A is a pointer to + * the first element of the matrix, and an is the number of elements + * between two rows. This macro produces the element A[i,j] + * given A, an, i and j + **********************************************************************/ +#define ELEM(A, an, i, j) (A[(i)*(an)+(j)]) + +void matrixmul(int n, REAL *A, int an, REAL *B, int bn, REAL *C, int cn); +void FastNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB); +void FastAdditiveNaiveMatrixMultiply(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB); +void MultiplyByDivideAndConquer(REAL *C, REAL *A, REAL *B, + unsigned MatrixSize, + unsigned RowWidthC, + unsigned RowWidthA, + unsigned RowWidthB, + int AdditiveMode + ); +void OptimizedStrassenMultiply_par(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth); +void OptimizedStrassenMultiply_seq(REAL *C, REAL *A, REAL *B, unsigned MatrixSize, + unsigned RowWidthC, unsigned RowWidthA, unsigned RowWidthB, int Depth); +REAL *alloc_matrix(int n); +#endif + diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index ffe6ffa847..0a5ec834b3 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -11,12 +11,12 @@ #define PART_THD_COREID(t) (t >> 16) #define PART_THD_THDID(t) ((t << 16) >> 16) -#define PART_MAX_TASKS 256 -#define PART_MAX_DATA 256 +#define PART_MAX_TASKS 2048 +#define PART_MAX_DATA 2048 #define PART_MAX_PAR_THDS NUM_CPU -#define PART_MAX_CORE_THDS 64 +#define PART_MAX_CORE_THDS 48 #define PART_MAX_THDS 512 -#define PART_MAX_CHILD 16 +#define PART_MAX_CHILD 1024 #define PART_MAX_WORKSHARES 16 typedef void (*part_fn_t)(void *); diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index e6313481a5..0ede0f381b 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -62,7 +62,7 @@ distclean: make -C libcxx clean musl: - cd musl-1.1.11; ./configure "CFLAGS=-m32 -O3" "LDFLAGS=-Wl,-melf_i386" --disable-shared --target=i386; cd .. + cd musl-1.1.11; ./configure "CFLAGS=-m32 -march=i386 -msse -O3" "LDFLAGS=-Wl,-melf_i386" --disable-shared --target=i386; cd .. 
make -C musl-1.1.11 make -C musl-1.1.11 install diff --git a/src/components/lib/sl/sl_xcore.c b/src/components/lib/sl/sl_xcore.c index e46fc92113..b105a18411 100644 --- a/src/components/lib/sl/sl_xcore.c +++ b/src/components/lib/sl/sl_xcore.c @@ -77,21 +77,25 @@ sl_xcore_thd_lookup(thdid_t tid) extern struct sl_thd *sl_thd_alloc_no_cs(cos_thd_fn_t fn, void *data); +#define SL_IPI_ENABLE + static inline int _sl_xcore_request_enqueue_no_cs(cpuid_t core, struct sl_xcore_request *rq) { int ret = 0; - asndcap_t snd = 0; - + if (unlikely(core >= NUM_CPU)) return -1; if (unlikely(core == cos_cpuid())) return -1; if (unlikely(!bitmap_check(sl__globals()->core_bmp, core))) return -1; ret = ck_ring_enqueue_mpsc_xcore(sl__ring(core), sl__ring_buffer(core), rq); - snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; + +#ifdef SL_IPI_ENABLE + asndcap_t snd = sl__globals()->xcore_asnd[cos_cpuid()][core]; assert(snd); /* send an IPI for the request */ cos_asnd(snd, 0); +#endif if (unlikely(ret == false)) return -1; diff --git a/src/platform/i386/runscripts/omp_fft_bots.sh b/src/platform/i386/runscripts/omp_fft_bots.sh new file mode 100644 index 0000000000..858f140dd1 --- /dev/null +++ b/src/platform/i386/runscripts/omp_fft_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_fft_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_fft_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_sparselu_for_bots.sh b/src/platform/i386/runscripts/omp_sparselu_for_bots.sh new file mode 100644 index 0000000000..785b0eae92 --- /dev/null +++ b/src/platform/i386/runscripts/omp_sparselu_for_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_sparselu_for_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_sparselu_for_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_sparselu_single_bots.sh b/src/platform/i386/runscripts/omp_sparselu_single_bots.sh new file mode 100644 index 0000000000..1d1374aef4 --- /dev/null +++ b/src/platform/i386/runscripts/omp_sparselu_single_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_sparselu_single_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_sparselu_single_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/omp_strassen_bots.sh b/src/platform/i386/runscripts/omp_strassen_bots.sh new file mode 100644 index 0000000000..3fe5a88ac3 --- /dev/null +++ b/src/platform/i386/runscripts/omp_strassen_bots.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_strassen_bots.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_strassen_bots.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From 3269cd1638fe5c09840f68aa61ebb0e24dfcba81 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 22 May 2019 02:21:27 -0400 Subject: [PATCH 100/127] Fixed user-level sl_thd_rcv * To return 0 on wakeup like cos_rcv does. 
* On non-blocking case, to return -EAGAIN at user-level. --- src/components/include/sl.h | 12 ++++++++---- src/kernel/include/thd.h | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 3ac908ac21..4d41e2770a 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -898,14 +898,18 @@ sl_thd_rcv(rcv_flags_t flags) { struct sl_thd *t = sl_thd_curr(); unsigned long *p = &sl_thd_dcbinfo(t)->pending, q = 0; + int ret = 0; assert(sl_thd_rcvcap(t)); check: sl_cs_enter(); - q = *p; - if (q == 0) { + /* there no pending event in the dcbinfo->pending */ + if ((q = ps_load(p)) == 0) { if (unlikely(!(flags & RCV_ULONLY))) goto rcv; - if (unlikely(flags & RCV_NON_BLOCKING)) goto done; + if (unlikely(flags & RCV_NON_BLOCKING)) { + ret = -EAGAIN; + goto done; + } sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); sl_cs_exit_switchto(sl__globals_core()->sched_thd); @@ -918,7 +922,7 @@ sl_thd_rcv(rcv_flags_t flags) done: sl_cs_exit(); - return q; + return ret; rcv: sl_cs_exit(); diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 00cdb65e65..e507033db8 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -252,15 +252,15 @@ thd_rcvcap_set_counter(struct thread *t, sched_tok_t cntr) static void thd_rcvcap_pending_set(struct thread *arcvt) { - arcvt->rcvcap.pending = 1; - if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 1; + else arcvt->rcvcap.pending = 1; } static void thd_rcvcap_pending_reset(struct thread *arcvt) { arcvt->rcvcap.pending = 0; + if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 0; } static inline int From 7c5d142099b6fedcec4bd8066b24e37105109ae2 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 22 May 2019 02:22:43 -0400 Subject: [PATCH 101/127] test program to test sl_thd_rcv(), uses IPIs for interrupts. 
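
For reference, a minimal sketch of how a caller is expected to use the new
return values (illustrative only: the flag combination and loop structure
are assumptions and not part of this test; the 0 / -EAGAIN returns and the
RCV_ULONLY / RCV_NON_BLOCKING flags are those of sl_thd_rcv() in sl.h from
the previous patch):

	while (1) {
		int r = sl_thd_rcv(RCV_ULONLY | RCV_NON_BLOCKING);

		if (r == -EAGAIN) {
			/* nothing pending at user-level: block until woken up */
			r = sl_thd_rcv(RCV_ULONLY);
		}
		assert(r == 0); /* 0 on wakeup, matching cos_rcv() */
		/* ... process the notification ... */
	}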
--- .../implementation/tests/unit_slrcv/Makefile | 8 ++ .../implementation/tests/unit_slrcv/init.c | 91 +++++++++++++++ .../implementation/tests/unit_slrcv/spinlib.c | 110 ++++++++++++++++++ .../implementation/tests/unit_slrcv/spinlib.h | 20 ++++ src/platform/i386/runscripts/unit_slrcv.sh | 4 + 5 files changed, 233 insertions(+) create mode 100644 src/components/implementation/tests/unit_slrcv/Makefile create mode 100644 src/components/implementation/tests/unit_slrcv/init.c create mode 100644 src/components/implementation/tests/unit_slrcv/spinlib.c create mode 100644 src/components/implementation/tests/unit_slrcv/spinlib.h create mode 100644 src/platform/i386/runscripts/unit_slrcv.sh diff --git a/src/components/implementation/tests/unit_slrcv/Makefile b/src/components/implementation/tests/unit_slrcv/Makefile new file mode 100644 index 0000000000..0c3074e079 --- /dev/null +++ b/src/components/implementation/tests/unit_slrcv/Makefile @@ -0,0 +1,8 @@ +COMPONENT=unit_slrcvtest.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend -lcos_dcb + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c new file mode 100644 index 0000000000..8a1fb73eaf --- /dev/null +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -0,0 +1,91 @@ +#include +#include +#include +#include +#include +#include +#include "spinlib.h" + +static struct sl_xcore_thd *ping; +static struct sl_xcore_thd *pong; + +#define WORK_US (10*1000*1000) + +static inline void +ping_fn(void *d) +{ + asndcap_t s = *(asndcap_t *)d; + + while (1) { + printc("s"); + int r = cos_asnd(s, 0); + + assert(r == 0); + spinlib_usecs(WORK_US); + } + sl_thd_exit(); +} + +static inline void +pong_fn(arcvcap_t r, void *d) +{ + while (1) { + int p = sl_thd_rcv(RCV_ULONLY); + //int p = cos_rcv(r, 0); + + printc("%d", p); + } + sl_thd_exit(); +} + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long init_done[NUM_CPU] = { 0 }; + static volatile arcvcap_t r = 0; + static volatile asndcap_t s = 0; + unsigned int cycs_per_us = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + + assert(NUM_CPU == 2); + + if (cos_cpuid() == 0) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US); + spinlib_calib(cycs_per_us); + + struct sl_thd *t = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + r = sl_thd_rcvcap(t); + assert(r); + } else { + while (!ps_load(&init_done[0])) ; + + cos_defcompinfo_sched_init(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US); + + struct sl_thd *t = sl_thd_alloc(ping_fn, (void *)&s); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + + while (!r) ; + s = cos_asnd_alloc(ci, r, ci->captbl_cap); + assert(s); + } + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} diff --git a/src/components/implementation/tests/unit_slrcv/spinlib.c b/src/components/implementation/tests/unit_slrcv/spinlib.c new file mode 100644 index 0000000000..22ff1218b3 --- /dev/null +++ b/src/components/implementation/tests/unit_slrcv/spinlib.c @@ -0,0 +1,110 @@ +#include "spinlib.h" +#include + +#define SPINLIB_CALIB 256 + +static u64_t spinlib_cycs_per_spin_iters = 0; +static u64_t spinlib_usecs_per_spin_iters = 0; +unsigned int spinlib_cycs_per_us = 0; +static unsigned int spinlib_init = 0; + +void spinlib_calib(unsigned int cycs_per_us) __attribute__((optimize("O0"))); +void spinlib_usecs(cycles_t usecs) __attribute__((optimize("O0"))); +void spinlib_cycles(cycles_t cycs) __attribute__((optimize("O0"))); +void spinlib_std_iters(void) __attribute__((optimize("O0"))); + +#define SPINLIB_TEST_NITEMS 4 + +static void +spinlib_calib_test(void) +{ + microsec_t test_us[SPINLIB_TEST_NITEMS] = { 1000, 2000, 3000, 4000 }; + int i; + + for (i = 0; i < SPINLIB_TEST_NITEMS; i++) { + cycles_t st, end, elapsed_cycs; + + rdtscll(st); + spinlib_usecs(test_us[i]); + rdtscll(end); + elapsed_cycs = end - st; + + PRINTC("SPIN %lluus => elapsed :%lluus %llucycs\n", test_us[i], elapsed_cycs, sl_cyc2usec(elapsed_cycs)); + } +} + +void +spinlib_std_iters(void) +{ + unsigned int i; + + for (i = 0 ; i < SPINLIB_ITERS_SPIN ; i++) { + __asm__ __volatile__("nop": : :"memory"); + } +} + +/* time taken in that loop */ +void +spinlib_calib(unsigned int cycs_per_us) +{ + cycles_t total_cycs = 0; + unsigned int iters = 0; + + if (spinlib_init) return; + spinlib_cycs_per_us = cycs_per_us; + + while (iters < SPINLIB_CALIB) { + cycles_t start, end; + + rdtscll(start); + spinlib_std_iters(); + rdtscll(end); + + total_cycs += (end - start); + iters ++; + } + + spinlib_cycs_per_spin_iters = total_cycs / SPINLIB_CALIB; + spinlib_usecs_per_spin_iters = spinlib_cycs_per_spin_iters / spinlib_cycs_per_us; + + spinlib_init = 0; + printc("Spin calibration: ITERS:%u Cycs/ITERS:%llu usecs/ITERS:%llu\n", + SPINLIB_ITERS_SPIN, spinlib_cycs_per_spin_iters, spinlib_usecs_per_spin_iters); + spinlib_calib_test(); +} + +void +spinlib_cycles(cycles_t cycs) +{ + unsigned int i = 0; + unsigned int iters = cycs / spinlib_cycs_per_spin_iters; + unsigned int left = cycs % spinlib_cycs_per_spin_iters; + + assert(cycs >= spinlib_cycs_per_spin_iters); + + /* round off to next cycs/spin */ + if (left >= (spinlib_cycs_per_spin_iters / 2)) iters ++; + + while (i < iters) { + spinlib_std_iters(); + i ++; + } +} + +void +spinlib_usecs(cycles_t usecs) +{ + unsigned int i = 0; + unsigned int iters = usecs / spinlib_usecs_per_spin_iters; + unsigned int left = usecs % spinlib_usecs_per_spin_iters; + + assert(usecs >= spinlib_usecs_per_spin_iters); + + /* round off to next usec */ + if (left >= (spinlib_usecs_per_spin_iters / 2)) iters ++; + + while (i < iters) { + spinlib_std_iters(); + i ++; + } +} diff --git a/src/components/implementation/tests/unit_slrcv/spinlib.h b/src/components/implementation/tests/unit_slrcv/spinlib.h new file mode 100644 index 0000000000..6c477fc48c --- /dev/null +++ b/src/components/implementation/tests/unit_slrcv/spinlib.h @@ -0,0 +1,20 @@ +#ifndef SPINLIB_H +#define SPINLIB_H + +#include +#include +#include + +/* + * this is probably the trickiest thing to configure and + * the accuracy of the workgen depends very much on this. 
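+ * spinlib_calib() times SPINLIB_ITERS_SPIN no-op iterations against the
+ * cycles-per-usec value passed in at init, so an unsuitable value here
+ * skews every spinlib_usecs()/spinlib_cycles() estimate.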
+ */ +#define SPINLIB_ITERS_SPIN (51000) + +extern unsigned int spinlib_cycs_per_us; + +extern void spinlib_calib(unsigned int cycs_per_us); +extern void spinlib_usecs(cycles_t usecs); +extern void spinlib_cycles(cycles_t cycs); + +#endif /* SPINLIB_H */ diff --git a/src/platform/i386/runscripts/unit_slrcv.sh b/src/platform/i386/runscripts/unit_slrcv.sh new file mode 100644 index 0000000000..a12a03d75d --- /dev/null +++ b/src/platform/i386/runscripts/unit_slrcv.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp unit_slrcvtest.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub From 357676b9242d4ee3fb58e1834461bcf002279d11 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 22 May 2019 13:25:32 -0400 Subject: [PATCH 102/127] SparseLU works but is not stable (remember to bump up number of tasks to 8k) * it says successful sometimes, failed sometimes. --- src/components/Makefile.comp | 1 - .../no_interface/omp_dijkstra/posix_basic.c | 44 +++++++++++++++---- .../omp_sparselu_single_bots/app-desc.h | 7 +-- src/components/include/deque.h | 4 +- src/components/include/part_task.h | 2 +- src/components/lib/Makefile | 2 +- src/components/lib/cos_gomp/cos_gomp.c | 9 +++- src/components/lib/part_raw.c | 36 ++++++++++++++- src/platform/i386/qemu-kvm.sh | 2 +- 9 files changed, 84 insertions(+), 23 deletions(-) diff --git a/src/components/Makefile.comp b/src/components/Makefile.comp index 6afb70aaff..2a4887534e 100644 --- a/src/components/Makefile.comp +++ b/src/components/Makefile.comp @@ -56,7 +56,6 @@ LDFLAGS+=-no-pie CFLAGS+=-fno-pie CXXFLAGS+=-fno-pie endif -CFLAGS+=-march=i386 -msse SERVER_STUB=s_stub.o CLIENT_STUB=c_stub.o diff --git a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c index 5a86408010..522c760962 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c +++ b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c @@ -12,6 +12,40 @@ #include #include +/* + * hack for more memory using the most insecure feature in composite: + * map random physical addresses to virtual addresses and do whatever with it! + */ +#define START_PHY round_up_to_page(0x00100000 + COS_PHYMEM_MAX_SZ + PAGE_SIZE) +#define PHY_MAX (512*1024*1024) + +static unsigned free_phy_offset = 0; + +void * +__alloc_memory(size_t sz) +{ + void *va = NULL; + struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); + unsigned off = ps_faa(&free_phy_offset, sz); + + /* + * first use physical memory hack and + * if we run out, then use heap alloc so + * we don't run out of standard memory first + */ + if (off > PHY_MAX || off + sz > PHY_MAX) { + va = cos_page_bump_allocn(ci, round_up_to_page(sz)); + } else { + /* use physical memory hack! 
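+		 * cos_hw_map() pulls the physical range at START_PHY + off
+		 * straight into this component's address space, with no
+		 * ownership tracking -- hence "most insecure feature" above.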
*/ + va = cos_hw_map(ci, BOOT_CAPTBL_SELF_INITHW_BASE, START_PHY + off, sz); + } + + assert(va); + memset(va, 0, sz); + + return va; +} + //#include // HACK: The hack to end all hacks @@ -31,15 +65,9 @@ cos_mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset) return MAP_FAILED; } - int pages; - if (length % 4096) { - pages = length / 4096 + 1; - } else { - pages = length / 4096; - } - //addr = (void *)memmgr_heap_page_allocn(pages); - addr = (void *)cos_page_bump_allocn(cos_compinfo_get(cos_defcompinfo_curr_get()), pages * PAGE_SIZE); + addr = __alloc_memory(length); +// addr = (void *)cos_page_bump_allocn(cos_compinfo_get(cos_defcompinfo_curr_get()), round_up_to_page(length)); if (!addr){ ret = (void *) -1; } else { diff --git a/src/components/implementation/no_interface/omp_sparselu_single_bots/app-desc.h b/src/components/implementation/no_interface/omp_sparselu_single_bots/app-desc.h index 05f059121a..5362c8504e 100644 --- a/src/components/implementation/no_interface/omp_sparselu_single_bots/app-desc.h +++ b/src/components/implementation/no_interface/omp_sparselu_single_bots/app-desc.h @@ -48,9 +48,6 @@ int sparselu_check(float **SEQ, float **BENCH); #define KERNEL_SEQ_CALL sparselu_seq_call(SEQ); #define KERNEL_SEQ_FINI sparselu_fini(SEQ,"serial"); -/* - * Phani: start with not doing serial! - */ -#undef BOTS_APP_CHECK_USES_SEQ_RESULT -//#define KERNEL_CHECK sparselu_check(SEQ,BENCH); +#define BOTS_APP_CHECK_USES_SEQ_RESULT +#define KERNEL_CHECK sparselu_check(SEQ,BENCH); diff --git a/src/components/include/deque.h b/src/components/include/deque.h index 7f5a1fe164..696eb5781c 100644 --- a/src/components/include/deque.h +++ b/src/components/include/deque.h @@ -19,7 +19,7 @@ * PPoPP implementation paper, "Correct and Efficient Work-Stealing for Weak Memory Models" * https://www.di.ens.fr/~zappa/readings/ppopp13.pdf */ -#define DEQUE_MAX_SZ (1<<13) +#define DEQUE_MAX_SZ (1<<14) #define DEQUE_PROTOTYPE(name, type) \ struct deque_##name { \ @@ -37,7 +37,7 @@ deque_init_##name(struct deque_##name *q, size_t sz) \ \ if (sz) { \ /* only for size with pow of 2 */ \ - assert((sz & (sz - 1)) == 0); \ + /* assert((sz & (sz - 1)) == 0); */ \ assert(sz <= DEQUE_MAX_SZ); \ } else { \ sz = DEQUE_MAX_SZ; \ diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index 0a5ec834b3..a42f9ad64a 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -11,7 +11,7 @@ #define PART_THD_COREID(t) (t >> 16) #define PART_THD_THDID(t) ((t << 16) >> 16) -#define PART_MAX_TASKS 2048 +#define PART_MAX_TASKS 2048 #define PART_MAX_DATA 2048 #define PART_MAX_PAR_THDS NUM_CPU #define PART_MAX_CORE_THDS 48 diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index 0ede0f381b..e6313481a5 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -62,7 +62,7 @@ distclean: make -C libcxx clean musl: - cd musl-1.1.11; ./configure "CFLAGS=-m32 -march=i386 -msse -O3" "LDFLAGS=-Wl,-melf_i386" --disable-shared --target=i386; cd .. + cd musl-1.1.11; ./configure "CFLAGS=-m32 -O3" "LDFLAGS=-Wl,-melf_i386" --disable-shared --target=i386; cd .. 
make -C musl-1.1.11 make -C musl-1.1.11 install diff --git a/src/components/lib/cos_gomp/cos_gomp.c b/src/components/lib/cos_gomp/cos_gomp.c index f388db628f..1c338c537b 100644 --- a/src/components/lib/cos_gomp/cos_gomp.c +++ b/src/components/lib/cos_gomp/cos_gomp.c @@ -319,10 +319,15 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *), assert(depend == NULL); if (if_clause) { - struct part_task *pt = _cos_gomp_alloc_explicit(); - struct part_data *d = part_data_alloc(); + struct part_task *pt; + struct part_data *d; char *arg = NULL; + pt = _cos_gomp_alloc_explicit(); + assert(pt); + d = part_data_alloc(); + assert(d); + assert(pt && d); assert(arg_size + arg_align - 1 <= PART_MAX_DATA); memset(d->data, 0, PART_MAX_DATA); diff --git a/src/components/lib/part_raw.c b/src/components/lib/part_raw.c index 04c130d9b0..273ce48f4a 100644 --- a/src/components/lib/part_raw.c +++ b/src/components/lib/part_raw.c @@ -162,6 +162,21 @@ partdata_store_init_all(vaddr_t mem) } } +static inline struct part_data * +partdata_store_dequeue_any(void) +{ + struct part_data *p = NULL; + int i = 0; + + for (i = 0; i < NUM_CPU; i++) { + p = partdata_store_dequeue(&pd_head[(cos_cpuid() + i) % NUM_CPU]); + + if (p) break; + } + + return p; +} + struct parttask_head pt_head[NUM_CPU]; static inline void @@ -179,6 +194,21 @@ parttask_store_init_all(vaddr_t mem) } } +static inline struct part_task * +parttask_store_dequeue_any(void) +{ + struct part_task *p = NULL; + int i = 0; + + for (i = 0; i < NUM_CPU; i++) { + p = parttask_store_dequeue(&pt_head[(cos_cpuid() + i) % NUM_CPU]); + + if (p) break; + } + + return p; +} + /* idle thread to wakeup when there is nothing to do on this core! */ static void part_idle_fn(void *d) @@ -197,7 +227,8 @@ part_idle_fn(void *d) struct part_data * part_data_alloc(void) { - struct part_data *d = partdata_store_dequeue(&pd_head[cos_cpuid()]); + struct part_data *d = partdata_store_dequeue_any(); + //struct part_data *d = partdata_store_dequeue(&pd_head[cos_cpuid()]); if (!d) return d; if (!ps_cas(&d->flag, 0, 1)) assert(0); @@ -242,7 +273,8 @@ part_data_free(struct part_data *d) struct part_task * part_task_alloc(part_task_type_t type) { - struct part_task *t = parttask_store_dequeue(&pt_head[cos_cpuid()]); + struct part_task *t = parttask_store_dequeue_any(); + //struct part_task *t = parttask_store_dequeue(&pt_head[cos_cpuid()]); if (!t) return t; diff --git a/src/platform/i386/qemu-kvm.sh b/src/platform/i386/qemu-kvm.sh index 2ec66f87b1..d1d7c43efe 100755 --- a/src/platform/i386/qemu-kvm.sh +++ b/src/platform/i386/qemu-kvm.sh @@ -12,4 +12,4 @@ fi MODULES=$(sh $1 | awk '/^Writing image/ { print $3; }' | tr '\n' ' ') #qemu-system-i386 -m 768 -nographic -kernel kernel.img -no-reboot -s -initrd "$(echo $MODULES | tr ' ' ',')" -qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=4,threads=1 -cpu host -nographic -m 768 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" +qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=6,threads=1 -cpu host -nographic -m 2048 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" From e05963a85210b2d29a54aa10658b8e14021a7893 Mon Sep 17 00:00:00 2001 From: Phani Date: Wed, 22 May 2019 13:35:54 -0400 Subject: [PATCH 103/127] makefile fix: cannot clean ps on distclean (first time) * I recently added that rule to clean ps on distclean but I didn't realize it wont work on fresh clone, for now reverting it. 
--- src/components/lib/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/components/lib/Makefile b/src/components/lib/Makefile index e6313481a5..5d55f38cbe 100644 --- a/src/components/lib/Makefile +++ b/src/components/lib/Makefile @@ -57,9 +57,8 @@ distclean: make -C musl-1.1.11 distclean # keep the following commands in one line. make executes each line # with a new shell. - make -C ck uninstall - make -C ps clean make -C libcxx clean + make -C ck uninstall musl: cd musl-1.1.11; ./configure "CFLAGS=-m32 -O3" "LDFLAGS=-Wl,-melf_i386" --disable-shared --target=i386; cd .. From 8dc3f883d072c0b421b7324e7a3e3b1b041ddc8a Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 22 May 2019 22:12:06 -0400 Subject: [PATCH 104/127] Timer programming only in the case of scheduler thread scheduling --- src/components/include/sl.h | 20 ++++++++++---------- src/components/lib/sl/sl_sched.c | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 4d41e2770a..f6fe36cddf 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -574,7 +574,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) struct sl_thd *t = to; struct sl_global_core *globals = sl__globals_core(); sched_tok_t tok; - cycles_t now; +// cycles_t now; s64_t offset; int ret; @@ -584,12 +584,12 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) #endif tok = cos_sched_sync(); - now = sl_now(); +// now = sl_now(); /* still wakeup without timeouts? that adds to dispatch overhead! */ - offset = (s64_t)(globals->timer_next - now); - if (globals->timer_next && offset <= 0) sl_timeout_expended(now, globals->timer_next); - sl_timeout_wakeup_expired(now); +// offset = (s64_t)(globals->timer_next - now); +// if (globals->timer_next && offset <= 0) sl_timeout_expended(now, globals->timer_next); +// sl_timeout_wakeup_expired(now); /* * Once we exit, we can't trust t's memory as it could be @@ -625,11 +625,11 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) * if the periodic timer is already ahead, * don't reprogram it! 
*/ - if (likely(offset > globals->cyc_per_usec && globals->timer_prev)) { +// if (likely(offset > globals->cyc_per_usec && globals->timer_prev)) { ret = sl_thd_dispatch(t, tok, sl_thd_curr()); - } else { - ret = sl_thd_activate(t, tok, globals->timeout_next); - } +// } else { +// ret = sl_thd_activate(t, tok, globals->timeout_next); +// } /* * one observation, in slowpath switch: @@ -713,7 +713,7 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) struct sl_thd_policy *pt = sl_mod_schedule(); if (unlikely(!pt)) - t = globals->sched_thd; + t = globals->idle_thd; else t = sl_mod_thd_get(pt); } diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 212f416a8b..60c07c1da1 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -769,7 +769,7 @@ sl_sched_loop_intern(int non_block) if (sl_cs_enter_sched()) continue; /* If switch returns an inconsistency, we retry anyway */ - sl_cs_exit_schedule_nospin(); + sl_cs_exit_schedule_nospin_timeout(g->timer_next); } } From 8e28bcfd10e404af8cd157d8b078e114593724d3 Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 22 May 2019 22:13:17 -0400 Subject: [PATCH 105/127] Added the program for work-conservation problem analysis --- .../omp_workconservation/Makefile | 10 ++ .../no_interface/omp_workconservation/init.c | 1 + .../omp_workconservation/work_problem.c | 94 +++++++++++++++++++ src/components/include/part.h | 5 +- .../i386/runscripts/omp_workconsprob.sh | 10 ++ 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 src/components/implementation/no_interface/omp_workconservation/Makefile create mode 120000 src/components/implementation/no_interface/omp_workconservation/init.c create mode 100644 src/components/implementation/no_interface/omp_workconservation/work_problem.c create mode 100644 src/platform/i386/runscripts/omp_workconsprob.sh diff --git a/src/components/implementation/no_interface/omp_workconservation/Makefile b/src/components/implementation/no_interface/omp_workconservation/Makefile new file mode 100644 index 0000000000..816ae03c7e --- /dev/null +++ b/src/components/implementation/no_interface/omp_workconservation/Makefile @@ -0,0 +1,10 @@ +COMPONENT=omp_workconsprob.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp diff --git a/src/components/implementation/no_interface/omp_workconservation/init.c b/src/components/implementation/no_interface/omp_workconservation/init.c new file mode 120000 index 0000000000..b2694bf833 --- /dev/null +++ b/src/components/implementation/no_interface/omp_workconservation/init.c @@ -0,0 +1 @@ +../omp_hello/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_workconservation/work_problem.c b/src/components/implementation/no_interface/omp_workconservation/work_problem.c new file mode 100644 index 0000000000..6d03cee429 --- /dev/null +++ b/src/components/implementation/no_interface/omp_workconservation/work_problem.c @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#define GETTID() cos_thdid() +#define sched_getcpu() cos_cpuid() +#define CYC_US 3200 + +/* + * From Chaos tests! + * NOTE: number obtained by running composite instance with no interference.. 
+ * (validated with fiasco so far, it is 10us) + */ +#define ITERS_10US 5850 +#define MULTIPLE 100 + +#define SPIN_ITERS (ITERS_10US*MULTIPLE) + +static void __spin_fn(void) __attribute__((optimize("O0"))); + +static void +__spin_fn(void) +{ + unsigned int spin = 0; + + while (spin < SPIN_ITERS) { + __asm__ __volatile__("nop": : :"memory"); + spin++; + } +} + +#define ITERS 10 + +int main(void) +{ + unsigned long long max = 0; + int i; + unsigned long long x, y; + + rdtscll(x); + __spin_fn(); + rdtscll(y); + printc("%llu:%llu\n", y - x, sl_cyc2usec(y - x)); + + + for (i = 0; i < ITERS; i++) { + volatile unsigned long long st = 0, en = 0; + + rdtscll(st); + #pragma omp parallel + { + //printf("(a, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); + #pragma omp single + { + //printf("(b, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); + #pragma omp task + { + //printf("(c, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); + #pragma omp task + { + __spin_fn(); + //printf("(d, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); + } + #pragma omp taskwait + } + + #pragma omp task + { + __spin_fn(); + //printf("(e, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); + } + __spin_fn(); + #pragma omp taskwait + } + //printf("(f, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); + } + rdtscll(en); + long diff = en - st; + if (diff > 0) { + if (max < diff) max = diff; + printc("%llu\n", (en - st) / CYC_US); + } + } + + printc("Max: %llu\n", max / CYC_US); +// printf("Time: %llu, %llu\n", en - st, (en -st) / CYC_US); + + return 0; +} diff --git a/src/components/include/part.h b/src/components/include/part.h index cdf97767bf..77decb709f 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -234,8 +234,8 @@ part_list_remove(struct part_task *t) int in_nest = 0; assert(t->type == PART_TASK_T_WORKSHARE); - assert(t->nthds > 1); #if defined(PART_ENABLE_NESTED) + assert(t->nthds > 1); assert(!ps_list_singleton(t, partask)); crt_lock_take(&part_l_lock); @@ -327,6 +327,7 @@ part_task_barrier(struct part_task *t, int is_end) if (t->type == PART_TASK_T_WORKSHARE) { assert(is_master); ts->part_context = t->parent; + part_list_remove(t); return; } @@ -357,7 +358,7 @@ part_task_barrier(struct part_task *t, int is_end) sl_thd_block(0); } } - assert(ps_load(&t->barrier_epoch) == cbep + 1); + //assert(ps_load(&t->barrier_epoch) == cbep + 1); if (!is_end) return; ps_faa(&t->end, 1); diff --git a/src/platform/i386/runscripts/omp_workconsprob.sh b/src/platform/i386/runscripts/omp_workconsprob.sh new file mode 100644 index 0000000000..5e7a8985a6 --- /dev/null +++ b/src/platform/i386/runscripts/omp_workconsprob.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_workconsprob.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_hello.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From 3ad80d2b63529df0203eacb55b8f2a27d9a95183 Mon Sep 17 00:00:00 2001 From: Phani Date: Fri, 24 May 2019 16:29:45 -0400 Subject: [PATCH 106/127] random changes on the last 2 days --- .../no_interface/omp_dijkstra/posix_basic.c | 9 +- .../no_interface/omp_ubench/Makefile | 19 + .../no_interface/omp_ubench/init.c | 1 + .../no_interface/omp_ubench/posix_basic.c | 1 + .../no_interface/omp_ubench/ubench.c | 156 ++++++ .../omp_workconservation/work_problem.c | 27 +- 
.../tests/micro_booter/mb_tests.c | 92 ++-- .../tests/micro_booter/micro_booter.c | 4 +- .../tests/micro_booter/micro_booter.h | 2 +- .../implementation/tests/micro_chan/Makefile | 8 + .../tests/micro_chan/unit_schedlib.c | 476 ++++++++++++++++++ .../tests/unit_schedcomp/unit_schedlib.c | 24 +- .../tests/unit_schedtests/inv.S | 1 + .../tests/unit_schedtests/unit_schedlib.c | 139 ++++- src/components/include/crt_chan.h | 3 +- src/components/include/part_task.h | 6 +- src/kernel/include/shared/cos_config.h | 2 +- src/platform/i386/runscripts/omp_ubench.sh | 10 + 18 files changed, 898 insertions(+), 82 deletions(-) create mode 100644 src/components/implementation/no_interface/omp_ubench/Makefile create mode 120000 src/components/implementation/no_interface/omp_ubench/init.c create mode 120000 src/components/implementation/no_interface/omp_ubench/posix_basic.c create mode 100644 src/components/implementation/no_interface/omp_ubench/ubench.c create mode 100644 src/components/implementation/tests/micro_chan/Makefile create mode 100644 src/components/implementation/tests/micro_chan/unit_schedlib.c create mode 120000 src/components/implementation/tests/unit_schedtests/inv.S create mode 100644 src/platform/i386/runscripts/omp_ubench.sh diff --git a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c index 522c760962..41c8507068 100644 --- a/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c +++ b/src/components/implementation/no_interface/omp_dijkstra/posix_basic.c @@ -17,7 +17,7 @@ * map random physical addresses to virtual addresses and do whatever with it! */ #define START_PHY round_up_to_page(0x00100000 + COS_PHYMEM_MAX_SZ + PAGE_SIZE) -#define PHY_MAX (512*1024*1024) +#define PHY_MAX ((512 * 1024 * 1024) + (256 * 1024 * 1024)) static unsigned free_phy_offset = 0; @@ -26,7 +26,11 @@ __alloc_memory(size_t sz) { void *va = NULL; struct cos_compinfo *ci = cos_compinfo_get(cos_defcompinfo_curr_get()); - unsigned off = ps_faa(&free_phy_offset, sz); + //unsigned off = ps_faa(&free_phy_offset, sz); + unsigned off; + +try_again: + off = ps_load(&free_phy_offset); /* * first use physical memory hack and @@ -36,6 +40,7 @@ __alloc_memory(size_t sz) if (off > PHY_MAX || off + sz > PHY_MAX) { va = cos_page_bump_allocn(ci, round_up_to_page(sz)); } else { + if (!ps_cas(&free_phy_offset, off, off + sz)) goto try_again; /* use physical memory hack! 
*/ va = cos_hw_map(ci, BOOT_CAPTBL_SELF_INITHW_BASE, START_PHY + off, sz); } diff --git a/src/components/implementation/no_interface/omp_ubench/Makefile b/src/components/implementation/no_interface/omp_ubench/Makefile new file mode 100644 index 0000000000..d93533c7e5 --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/Makefile @@ -0,0 +1,19 @@ +COMPONENT=omp_ubench.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api -lcos_gomp $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o + +CFLAGS += -fopenmp +# if tied tasks are required +#CFLAGS += -DFORCE_TIED_TASKS + +#OMPC_FINAL_FLAGS= + +# one per compilation or none +#CFLAGS += -DMANUAL_CUTOFF +#CFLAGS += -DIF_CUTOFF +#CFLAGS += -DFINAL_CUTOFF $(OMPC_FINAL_FLAGS) diff --git a/src/components/implementation/no_interface/omp_ubench/init.c b/src/components/implementation/no_interface/omp_ubench/init.c new file mode 120000 index 0000000000..9e09b82e77 --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/init.c @@ -0,0 +1 @@ +../omp_fib_bots/init.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_ubench/posix_basic.c b/src/components/implementation/no_interface/omp_ubench/posix_basic.c new file mode 120000 index 0000000000..9afee078fb --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/posix_basic.c @@ -0,0 +1 @@ +../omp_fib_bots/posix_basic.c \ No newline at end of file diff --git a/src/components/implementation/no_interface/omp_ubench/ubench.c b/src/components/implementation/no_interface/omp_ubench/ubench.c new file mode 100644 index 0000000000..6d22daaf25 --- /dev/null +++ b/src/components/implementation/no_interface/omp_ubench/ubench.c @@ -0,0 +1,156 @@ +#include +#include +#include +#include + +#define ITERS 1000 +#define RECUR 4 + +#define DISPLAY_VALS + +void +test_parallel(void) +{ + cycles_t max = 0, total = 0; + int i, x = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + x++; + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +void +test_parallel_critical(void) +{ + cycles_t max = 0, total = 0; + int i, x = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + #pragma omp critical + { + x++; + } + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel+Critical (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +void +test_parallel_task(void) +{ + cycles_t max = 0, total = 0; + int i, x = 0, y = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + x++; + #pragma omp task + { + y++; + } + #pragma omp taskwait + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel+Task+Taskwait (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +void +test_parallel_task_4levels(void) +{ + cycles_t max = 0, total = 0; + 
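+	/*
+	 * Each iteration times one parallel region in which every thread
+	 * spawns a 4-deep chain of nested tasks, joining each level with a
+	 * taskwait; the per-iteration cost feeds the average/worst case
+	 * printed at the end.
+	 */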
int i, x = 0, y = 0; + + for (i = 0; i < ITERS; i++) { + cycles_t st, en, diff; + + rdtscll(st); + #pragma omp parallel + { + x++; + #pragma omp task + { + #pragma omp task + { + #pragma omp task + { + #pragma omp task + { + y++; + } + #pragma omp taskwait + y++; + } + #pragma omp taskwait + y++; + } + #pragma omp taskwait + y++; + } + #pragma omp taskwait + } + rdtscll(en); + + diff = en - st; + total += diff; + if (diff > max) max = diff; +#ifdef DISPLAY_VALS + PRINTC("%llu\n", diff); +#endif + } + + PRINTC("uBench Parallel+Task 4levels+Taskwait (NCORES:%u, NITERS=%d): AVG:%llu WC:%llu\n", NUM_CPU, ITERS, total / ITERS, max); +} + +int +main(void) +{ +// test_parallel(); +// test_parallel_critical(); + test_parallel_task(); +// test_parallel_task_4levels(); + + return 0; +} diff --git a/src/components/implementation/no_interface/omp_workconservation/work_problem.c b/src/components/implementation/no_interface/omp_workconservation/work_problem.c index 6d03cee429..b2cafd461c 100644 --- a/src/components/implementation/no_interface/omp_workconservation/work_problem.c +++ b/src/components/implementation/no_interface/omp_workconservation/work_problem.c @@ -17,7 +17,7 @@ * (validated with fiasco so far, it is 10us) */ #define ITERS_10US 5850 -#define MULTIPLE 100 +#define MULTIPLE 10000 #define SPIN_ITERS (ITERS_10US*MULTIPLE) @@ -34,18 +34,18 @@ __spin_fn(void) } } -#define ITERS 10 +#define ITERS 1000 int main(void) { - unsigned long long max = 0; + unsigned long long max = 0, total = 0; int i; unsigned long long x, y; rdtscll(x); __spin_fn(); rdtscll(y); - printc("%llu:%llu\n", y - x, sl_cyc2usec(y - x)); + printc("%llu:%llu\n\n\n", y - x, sl_cyc2usec(y - x)); for (i = 0; i < ITERS; i++) { @@ -54,17 +54,13 @@ int main(void) rdtscll(st); #pragma omp parallel { - //printf("(a, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); #pragma omp single { - //printf("(b, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); #pragma omp task { - //printf("(c, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); #pragma omp task { __spin_fn(); - //printf("(d, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); } #pragma omp taskwait } @@ -72,23 +68,22 @@ int main(void) #pragma omp task { __spin_fn(); - //printf("(e, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); } __spin_fn(); #pragma omp taskwait } - //printf("(f, %u:%u, %d)\n", sched_getcpu(), GETTID(), omp_get_thread_num()); } rdtscll(en); long diff = en - st; - if (diff > 0) { - if (max < diff) max = diff; - printc("%llu\n", (en - st) / CYC_US); - } + assert(diff > 0); + + total += diff; + if (diff > max) max = diff; + printc("%ld, %ld\n", diff, diff / CYC_US); } - printc("Max: %llu\n", max / CYC_US); -// printf("Time: %llu, %llu\n", en - st, (en -st) / CYC_US); + printc("(cyc) Avg: %llu, Max: %llu\n", (total / ITERS), max); + printc("(us) Avg: %llu, Max: %llu\n", (total / ITERS) / CYC_US, max / CYC_US); return 0; } diff --git a/src/components/implementation/tests/micro_booter/mb_tests.c b/src/components/implementation/tests/micro_booter/mb_tests.c index 5f66295200..0277a2cfbf 100644 --- a/src/components/implementation/tests/micro_booter/mb_tests.c +++ b/src/components/implementation/tests/micro_booter/mb_tests.c @@ -31,12 +31,15 @@ test_scb_dcb(void) test_dcb(); } +volatile int switched = 0; + static void thd_fn_perf(void *d) { cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); while (1) { +switched = 1; cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); } PRINTC("Error, shouldn't get 
here!\n"); @@ -46,7 +49,7 @@ static void test_thds_perf(void) { thdcap_t ts; - long long total_swt_cycles = 0; + long long total_swt_cycles = 0, max = 0; long long start_swt_cycles = 0, end_swt_cycles = 0; int i; @@ -54,15 +57,22 @@ test_thds_perf(void) assert(ts); cos_thd_switch(ts); - rdtscll(start_swt_cycles); for (i = 0; i < ITER; i++) { + cycles_t diff; + switched = 0; + rdtscll(start_swt_cycles); cos_thd_switch(ts); - } rdtscll(end_swt_cycles); - total_swt_cycles = (end_swt_cycles - start_swt_cycles) / 2LL; + assert(switched); + + diff = end_swt_cycles - start_swt_cycles; + total_swt_cycles += diff; + if (diff > max) max = diff; + } + //total_swt_cycles = (end_swt_cycles - start_swt_cycles) / 2LL; - PRINTC("Average THD SWTCH (Total: %lld / Iterations: %lld ): %lld\n", total_swt_cycles, (long long)ITER, - (total_swt_cycles / (long long)ITER)); + PRINTC("Average THD SWTCH (Iters: %lld ): %lld, WC:%llu\n", (long long)ITER, + (total_swt_cycles / (2 * (long long)ITER)), max / 2); } static void @@ -150,6 +160,7 @@ async_thd_fn_perf(void *thdcap) for (i = 0; i < ITER + 1; i++) { cos_rcv(rc, 0); + switched = 1; } cos_thd_switch(tc); @@ -160,21 +171,27 @@ async_thd_parent_perf(void *thdcap) { thdcap_t tc = (thdcap_t)thdcap; asndcap_t sc = scp_global[cos_cpuid()]; - long long total_asnd_cycles = 0; + long long total_asnd_cycles = 0, max = 0; long long start_asnd_cycles = 0, end_arcv_cycles = 0; int i; cos_asnd(sc, 1); - rdtscll(start_asnd_cycles); for (i = 0; i < ITER; i++) { + cycles_t diff; + switched = 0; + rdtscll(start_asnd_cycles); cos_asnd(sc, 1); - } rdtscll(end_arcv_cycles); - total_asnd_cycles = (end_arcv_cycles - start_asnd_cycles) / 2; + assert(switched); + diff = end_arcv_cycles - start_asnd_cycles; + if (diff > max) max = diff; + total_asnd_cycles += diff; + } + //total_asnd_cycles = (end_arcv_cycles - start_asnd_cycles) / 2; - PRINTC("Average ASND/ARCV (Total: %lld / Iterations: %lld ): %lld\n", total_asnd_cycles, (long long)(ITER), - (total_asnd_cycles / (long long)(ITER))); + PRINTC("Average ASND+ARCV (Iterations: %lld ): %lld, WC: %llu\n", (long long)(ITER), + (total_asnd_cycles / (long long)(ITER)), max); async_test_flag[cos_cpuid()] = 0; while (1) cos_thd_switch(tc); @@ -778,7 +795,7 @@ long long midinv_cycles[NUM_CPU] = { 0LL }; int test_serverfn(int a, int b, int c) { - rdtscll(midinv_cycles[cos_cpuid()]); + //rdtscll(midinv_cycles[cos_cpuid()]); return 0xDEADBEEF; } @@ -834,7 +851,7 @@ test_inv_perf(void) compcap_t cc; sinvcap_t ic; int i; - long long total_inv_cycles = 0LL, total_ret_cycles = 0LL; + long long total_inv_cycles = 0LL, total_ret_cycles = 0LL, max_inv = 0, max_ret = 0; unsigned int ret; cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, 0); @@ -846,19 +863,24 @@ test_inv_perf(void) for (i = 0; i < ITER; i++) { long long start_cycles = 0LL, end_cycles = 0LL; + long long diff_inv, diff_ret; midinv_cycles[cos_cpuid()] = 0LL; rdtscll(start_cycles); call_cap_mb(ic, 1, 2, 3); rdtscll(end_cycles); - total_inv_cycles += (midinv_cycles[cos_cpuid()] - start_cycles); - total_ret_cycles += (end_cycles - midinv_cycles[cos_cpuid()]); +// total_inv_cycles += (midinv_cycles[cos_cpuid()] - start_cycles); +// total_ret_cycles += (end_cycles - midinv_cycles[cos_cpuid()]); + diff_inv = end_cycles - start_cycles; + + if (diff_inv > max_inv) max_inv = diff_inv; + total_inv_cycles += diff_inv; } - PRINTC("Average SINV (Total: %lld / Iterations: %lld ): %lld\n", total_inv_cycles, (long long)(ITER), - (total_inv_cycles / (long 
long)(ITER))); - PRINTC("Average SRET (Total: %lld / Iterations: %lld ): %lld\n", total_ret_cycles, (long long)(ITER), - (total_ret_cycles / (long long)(ITER))); + PRINTC("Average SINV RPC (Iterations: %lld ): %lld, WC:%llu\n", (long long)(ITER), + (total_inv_cycles / (long long)(ITER)), max_inv); +// PRINTC("Average SRET (Total: %lld / Iterations: %lld ): %lld\n", total_ret_cycles, (long long)(ITER), + //(total_ret_cycles / (long long)(ITER))); } void @@ -931,22 +953,22 @@ test_run_mb(void) cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); /* test_ipi(); */ - test_timer(); - test_budgets(); - - test_scb_dcb(); - test_thds(); - test_thds_perf(); - - test_mem(); - - test_async_endpoints(); - test_async_endpoints_perf(); - - test_inv(); +// test_timer(); +// test_budgets(); +// +// test_scb_dcb(); +// test_thds(); +// test_thds_perf(); +// +// test_mem(); +// +// test_async_endpoints(); +// test_async_endpoints_perf(); +// +// test_inv(); test_inv_perf(); - - test_captbl_expand(); +// +// test_captbl_expand(); /* * FIXME: Preemption stack mechanism in the kernel is disabled. diff --git a/src/components/implementation/tests/micro_booter/micro_booter.c b/src/components/implementation/tests/micro_booter/micro_booter.c index 04316ddda9..c02a041140 100644 --- a/src/components/implementation/tests/micro_booter/micro_booter.c +++ b/src/components/implementation/tests/micro_booter/micro_booter.c @@ -42,8 +42,8 @@ cos_init(void) assert(termthd[cos_cpuid()]); PRINTC("Micro Booter started.\n"); - //test_run_mb(); - test_ipi_full(); + test_run_mb(); + //test_ipi_full(); /* NOTE: This is just to make sense of the output on HW! To understand that microbooter runs to completion on all cores! */ test_done[cos_cpuid()] = 1; diff --git a/src/components/implementation/tests/micro_booter/micro_booter.h b/src/components/implementation/tests/micro_booter/micro_booter.h index 7cde432d92..d39ac6e572 100644 --- a/src/components/implementation/tests/micro_booter/micro_booter.h +++ b/src/components/implementation/tests/micro_booter/micro_booter.h @@ -28,7 +28,7 @@ #include #include -#define ITER 10000 +#define ITER 1000000 #define TEST_NTHDS 5 extern struct cos_compinfo booter_info; diff --git a/src/components/implementation/tests/micro_chan/Makefile b/src/components/implementation/tests/micro_chan/Makefile new file mode 100644 index 0000000000..9ecb1154a8 --- /dev/null +++ b/src/components/implementation/tests/micro_chan/Makefile @@ -0,0 +1,8 @@ +COMPONENT=micro_chan.o +INTERFACES= +DEPENDENCIES= +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/micro_chan/unit_schedlib.c b/src/components/implementation/tests/micro_chan/unit_schedlib.c new file mode 100644 index 0000000000..17387a43ec --- /dev/null +++ b/src/components/implementation/tests/micro_chan/unit_schedlib.c @@ -0,0 +1,476 @@ +/* + * Copyright 2016, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. 
+ */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Iterations, channels */ +#define CHAN_ITER 1000000 +#define NCHANTHDS 2 +#define CHAN_BATCH 3 + +CRT_CHAN_STATIC_ALLOC(c0, int, 4); +CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); +struct crt_lock lock; + +typedef enum { CHILLING = 0, RECVING, SENDING } actions_t; +unsigned long status[NCHANTHDS]; +unsigned long cnts[NCHANTHDS] = {0, }; + +/* sl also defines a SPIN macro */ +#undef SPIN +#define SPIN(iters) \ + do { \ + if (iters > 0) { \ + for (; iters > 0; iters--) \ + ; \ + } else { \ + while (1) \ + ; \ + } \ + } while (0) + + +#define N_TESTTHDS 2 +#define WORKITERS 100 + +#define N_TESTTHDS_PERF 2 +#define PERF_ITERS 1000 + +static volatile cycles_t mid_cycs = 0; +static volatile int testing = 1; + +void +test_thd_perffn(void *data) +{ + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + unsigned int i = 0; + + rdtscll(start_cycs); + sl_thd_yield(0); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff1_cycs = 0, diff2_cycs = 0; + + mid_cycs = 0; + rdtscll(start_cycs); + sl_thd_yield(0); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + diff1_cycs = mid_cycs - start_cycs; + diff2_cycs = end_cycs - mid_cycs; + + if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; + if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; + total_cycs += (diff1_cycs + diff2_cycs); + } + + PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, PERF_ITERS); + testing = 0; + /* done testing! let the spinfn cleanup! */ + sl_thd_yield(0); + + sl_thd_exit(); +} + +void +test_thd_spinfn(void *data) +{ + while (likely(testing)) { + rdtscll(mid_cycs); + sl_thd_yield(0); + } + + sl_thd_exit(); +} +/* Get the numbers */ +volatile unsigned long long start_time; +volatile unsigned long long end_time; +//void +//test_thd_fn(void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int rounds = 0; +// if (data!=0) { +// while (1) { +// rounds++; +// rdtscll(start_time); +// sl_thd_yield(3); +// rdtscll(end_time); +// print_uint((unsigned long)(end_time-start_time)); +// print_string("\r\n"); +// if(rounds == 10000) +// while(1); +// } +// } +// else { +// while (1) { +// sl_thd_yield(4); +// } +// } +//} + +void +test_thd_fn(void *data) +{ + cycles_t time; + cycles_t iters; + cycles_t total = 0, max = 0, diff; + int send; + int recv; + int rounds = 0; + if (data!=0) { + while (1) { + rounds ++; + crt_chan_recv_test(c0, &recv); + rdtscll(end_time); + + diff = end_time - start_time; + if (diff > max) max = diff; + total += diff; + + if (rounds == 10000) { + printc("Avg: %llu, Wc:%llu\n", total / 10000, max); + + while (1) ; + } + //print_uint((unsigned long)(end_time-start_time)); + //print_string("\r\n"); + //if(rounds == 10000) + // while(1); + } + } + else { + crt_chan_init_test(c0); + while (1) { + send = 0x1234; + rdtscll(start_time); + crt_chan_send_test(c0, &send); + } + } +} + +//void +//test_thd_fn(void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int send; +// int recv; +// int rounds = 0; +// +// if (data!=0) { +// while (1) { +// rounds ++; +// +// crt_lock_take(&lock); +// sl_thd_yield(0); +// rdtscll(end_time); +// crt_lock_release(&lock); +// sl_thd_yield(0); +// +// print_uint((unsigned long)(end_time-start_time)); +// print_string("\r\n"); +// if(rounds == 10000) +// while(1); +// } +// } +// 
else { +// crt_lock_init(&lock); +// while (1) { +// rdtscll(start_time); +// crt_lock_take(&lock); +// crt_lock_release(&lock); +// sl_thd_yield(0); +// } +// } +//} +// +//volatile unsigned long long int_tsc; +//void +//test_thd_fn(capid_t cap, void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int send; +// int recv; +// unsigned int result; +// int rounds = 0; +// if (data==0) { +// while (1) { +// //print_string("*"); +// } +// } +// else { +// /* Higher priority on this branch */ +// cos_hw_attach(BOOT_CAPTBL_SELF_INITHW_BASE, 63, sl_thd_rcvcap(sl_thd_lkup(sl_thdid()))); +// cos_hw_custom(BOOT_CAPTBL_SELF_INITHW_BASE); +// while (1) { +// /* We are doing this receive anyway */ +// cos_rcv(sl_thd_rcvcap(sl_thd_lkup(sl_thdid())), 0); +// rdtscll(end_time); +// addr[rounds] = (unsigned int)(end_time-int_tsc); +// rounds ++; +// if(rounds == 10000) +// { +// for (rounds = 0; rounds < 10000; rounds ++) +// { +// print_uint(addr[rounds]); +// print_string("\r\n"); +// } +// while(1); +// } +// } +// } +//} + +// int rounds = 0; +//void +//test_thd_fn(capid_t cap, void *data) +//{ +// cycles_t time; +// cycles_t iters; +// int send; +// int recv; +// unsigned int result; +// /* if (data == 0) { +// while (1) { +// print_string("*"); +// } +// } +// else */if (data == 0) +// { +// /* Higher priority on this branch - receiving stuff from interrupt */ +// cos_hw_attach(BOOT_CAPTBL_SELF_INITHW_BASE, 63, sl_thd_rcvcap(sl_thd_lkup(sl_thdid()))); +// cos_hw_custom(BOOT_CAPTBL_SELF_INITHW_BASE); +// while (1) { +//// print_string(" :1a: \r\n"); +// /* We are doing this receive anyway */ +//// sl_thd_rcv(RCV_ULONLY); +// cos_rcv(sl_thd_rcvcap(sl_thd_lkup(sl_thdid())), 0); +//// print_string(" :1b: "); +// /* Send to the guy immediately */ +// crt_chan_send_test(c0, &send); +// //sl_thd_wakeup(4); +//// print_string(" :1c: "); +// //rdtscll(end_time); +// //addr[rounds] = (unsigned int)(end_time-int_tsc); +// } +// } +// else { +// while(1) { +// /* Finally, we send what we receive here */ +//// print_string(" :2a: "); +// //sl_thd_block(0); +// crt_chan_recv_test(c0, &recv); +//// print_string(" :2b: "); +// rdtscll(end_time); +// //print_uint(addr[rounds]); +// //print_string(" - "); +// addr[rounds] = (unsigned int)(end_time-int_tsc); +// //print_uint(addr[rounds]); +// //print_string("\r\n"); +// rounds ++; +// if(rounds == 10000) +// { +// for (rounds = 0; rounds < 10000; rounds ++) +// { +// print_uint(addr[rounds]); +// print_string("\r\n"); +// } +// while(1); +// } +// } +// } +//} + +//void +//test_yield_perf(void) +//{ +// int i; +// struct sl_thd *threads[N_TESTTHDS_PERF]; +// union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; +// +// for (i = 0; i < N_TESTTHDS_PERF; i++) { +// if (i == 1) threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); +// else threads[i] = sl_thd_alloc(test_thd_spinfn, NULL); +// assert(threads[i]); +// sl_thd_param_set(threads[i], sp.v); +// PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); +// } +//} + +//void +//test_yields(void) +//{ +// int i; +// struct sl_thd * threads[N_TESTTHDS]; +// union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; +// +// for (i = 0; i < N_TESTTHDS; i++) { +// threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); +// assert(threads[i]); +// sl_thd_param_set(threads[i], sp.v); +// PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); +// } +//} + +void +test_yields(void) +{ + int i; + struct sl_thd * 
threads[N_TESTTHDS]; + union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + + for (i = 0; i < N_TESTTHDS; i++) { + threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); + assert(threads[i]); + if(i != 0) + sp.c.value = 10; + sl_thd_param_set(threads[i], sp.v); + PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); + } +} + +//void +//test_yields(void) +//{ +// int i; +// struct sl_thd * threads[N_TESTTHDS]; +// union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 10}}; +// +// crt_chan_init_test(&c0); +// for (i = 0; i < N_TESTTHDS; i++) { +// threads[i] = sl_thd_aep_alloc(test_thd_fn, (void *)i, 0, 0, 0, 0); +// assert(threads[i]); +// if(i != 0) +// sp.c.value = 9; +// sl_thd_param_set(threads[i], sp.v); +// PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); +// } +//} + +void +test_high(void *data) +{ + struct sl_thd *t = data; + + while (1) { + sl_thd_yield(sl_thd_thdid(t)); + printc("h"); + } +} + +void +test_low(void *data) +{ + while (1) { + int workiters = WORKITERS * 10; + SPIN(workiters); + printc("l"); + } +} + +void +test_blocking_directed_yield(void) +{ + struct sl_thd * low, *high; + union sched_param_union sph = {.c = {.type = SCHEDP_PRIO, .value = 5}}; + union sched_param_union spl = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + + low = sl_thd_alloc(test_low, NULL); + high = sl_thd_alloc(test_high, low); + sl_thd_param_set(low, spl.v); + sl_thd_param_set(high, sph.v); +} + +#define TEST_ITERS 1000 + +void +test_high_wakeup(void *data) +{ + unsigned int toggle = 0, iters = 0; + struct sl_thd *t = data; + cycles_t start = sl_now(); + + while (1) { + cycles_t timeout = sl_now() + sl_usec2cyc(100); + + if (toggle % 10 == 0) + printc(".h:%llums.", sl_cyc2usec(sl_thd_block_timeout(0, timeout))); + else + printc(".h:%up.", sl_thd_block_periodic(0)); + + toggle++; + iters++; + + if (iters == TEST_ITERS) { + printc("\nTest done! (Duration: %llu ms)\n", sl_cyc2usec(sl_now() - start) / 1000); + printc("Deleting all threads. Idle thread should take over!\n"); + sl_thd_free(t); + sl_thd_free(sl_thd_curr()); + + /* should not be scheduled. 
*/ + assert(0); + } + } +} + +void +test_timeout_wakeup(void) +{ + struct sl_thd * low, *high; + union sched_param_union sph = {.c = {.type = SCHEDP_PRIO, .value = 5}}; + union sched_param_union spl = {.c = {.type = SCHEDP_PRIO, .value = 10}}; + union sched_param_union spw = {.c = {.type = SCHEDP_WINDOW, .value = 1000}}; + + low = sl_thd_alloc(test_low, NULL); + sl_thd_param_set(low, spl.v); + sl_thd_param_set(low, spw.v); + + high = sl_thd_alloc(test_high_wakeup, low); + sl_thd_param_set(high, sph.v); + sl_thd_param_set(high, spw.v); +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + + printc("Unit-test for the scheduling library (sl)\n"); + /* This is a hack, we know where the heap is */ + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US); + + //test_yield_perf(); + test_yields(); + //test_blocking_directed_yield(); + //test_timeout_wakeup(); + + sl_sched_loop_nonblock(); + + assert(0); + + return; +} diff --git a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c index c8980eafc0..919be433b1 100644 --- a/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedcomp/unit_schedlib.c @@ -49,10 +49,10 @@ test_thd_perffn(void *data) assert(perf_thd == sl_thd_curr()); rdtscll(start_cycs); //printc("a"); - sl_thd_yield(yield_to); + //sl_thd_yield(yield_to); //ret = sl_thd_dispatch(spin_thd, cos_sched_sync(), perf_thd); //sl_thd_yield_thd_c(perf_thd, spin_thd); - //sl_thd_yield_thd(spin_thd); + sl_thd_yield_thd(spin_thd); //assert(ret == 0); rdtscll(end_cycs); //assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); @@ -64,15 +64,15 @@ test_thd_perffn(void *data) end_cycs = start_cycs = 0; //mid_cycs = 0; switched = 0; - cos_rdtscp(start_cycs); - //rdtscll(start_cycs); + //cos_rdtscp(start_cycs); + rdtscll(start_cycs); //ret = sl_thd_dispatch(spin_thd, cos_sched_sync(), perf_thd); //printc("a"); - sl_thd_yield(yield_to); + //sl_thd_yield(yield_to); //sl_thd_yield_thd_c(perf_thd, spin_thd); - //sl_thd_yield_thd(spin_thd); - //rdtscll(end_cycs); - cos_rdtscp(end_cycs); + sl_thd_yield_thd(spin_thd); + rdtscll(end_cycs); + //cos_rdtscp(end_cycs); assert(switched); assert(ret == 0); //assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); @@ -80,7 +80,7 @@ test_thd_perffn(void *data) //diff1_cycs = mid_cycs - start_cycs; diff2_cycs = end_cycs - start_cycs; assert(diff2_cycs > rdtscp_min); - diff2_cycs -= rdtscp_min; + //diff2_cycs -= rdtscp_min; //if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; @@ -88,7 +88,7 @@ test_thd_perffn(void *data) total_cycs += diff2_cycs; } - PRINTC("SWITCH UBENCH (2 switches): avg: %llu, wc: %llu, bc: %llu, iters:%u\n", (total_cycs / (PERF_ITERS)), wc_cycs, bc_cycs, PERF_ITERS); + PRINTC("SWITCH UBENCH : avg: %llu, wc: %llu, bc: %llu, iters:%u\n", (total_cycs / (PERF_ITERS)) / 2, wc_cycs / 2, bc_cycs / 2, PERF_ITERS); testing = 0; /* done testing! free the spin thread! 
*/ while (1) ; @@ -108,9 +108,9 @@ test_thd_spinfn(void *data) switched = 1; //sl_thd_dispatch(perf_thd, cos_sched_sync(), spin_thd); //printc("b"); - sl_thd_yield(yield_to); + //sl_thd_yield(yield_to); //sl_thd_yield_thd_c(spin_thd, perf_thd); - //sl_thd_yield_thd(perf_thd); + sl_thd_yield_thd(perf_thd); } //sl_thd_dispatch(perf_thd, cos_sched_sync(), spin_thd); diff --git a/src/components/implementation/tests/unit_schedtests/inv.S b/src/components/implementation/tests/unit_schedtests/inv.S new file mode 120000 index 0000000000..cad20b8372 --- /dev/null +++ b/src/components/implementation/tests/unit_schedtests/inv.S @@ -0,0 +1 @@ +../micro_booter/inv.S \ No newline at end of file diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index c7cd84a532..19821aa5a8 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -35,17 +35,82 @@ #define N_TESTTHDS_PERF 2 #define PERF_ITERS 1000000 +#define MAGIC_RET 0xDEADBEEF + +#define INV_TEST static volatile cycles_t mid_cycs = 0; static volatile int testing = 1; + +int +test_serverfn(int a, int b, int c) +{ + //rdtscll(midinv_cycles[cos_cpuid()]); + return MAGIC_RET; +} + +extern void *__inv_test_serverfn(int a, int b, int c); + +static inline int +call_cap_mb(u32_t cap_no, int arg1, int arg2, int arg3) +{ + int ret; + + /* + * Which stack should we use for this invocation? Simple, use + * this stack, at the current sp. This is essentially a + * function call into another component, with odd calling + * conventions. + */ + cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; + + __asm__ __volatile__("pushl %%ebp\n\t" + "movl %%esp, %%ebp\n\t" + "movl %%esp, %%edx\n\t" + "movl $1f, %%ecx\n\t" + "sysenter\n\t" + "1:\n\t" + "popl %%ebp" + : "=a"(ret) + : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3) + : "memory", "cc", "ecx", "edx"); + + return ret; +} + +sinvcap_t sinv_cap = 0; + +static inline void +test_inv_setup(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + compcap_t cc; + sinvcap_t ic; + int i; + unsigned int ret; + + cc = cos_comp_alloc(ci, ci->captbl_cap, ci->pgtbl_cap, 0, (vaddr_t)NULL, 0); + assert(cc > 0); + ic = cos_sinv_alloc(ci, cc, (vaddr_t)__inv_test_serverfn, 0); + assert(ic > 0); + ret = call_cap_mb(ic, 1, 2, 3); + assert(ret == (int)MAGIC_RET); + + sinv_cap = ic; +} + +static struct sl_thd *perf_thd = NULL, *spin_thd = NULL; + void test_thd_perffn(void *data) { cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; unsigned int i = 0; + struct sl_thd *c = sl_thd_curr(); rdtscll(start_cycs); - sl_thd_yield(0); + sl_thd_yield_thd(spin_thd); rdtscll(end_cycs); assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); @@ -54,7 +119,7 @@ test_thd_perffn(void *data) mid_cycs = 0; rdtscll(start_cycs); - sl_thd_yield(0); + sl_thd_yield_thd_c(c, spin_thd); rdtscll(end_cycs); assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); @@ -69,7 +134,52 @@ test_thd_perffn(void *data) PRINTC("SWITCH UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / (2 * PERF_ITERS)), wc_cycs, PERF_ITERS); testing = 0; /* done testing! let the spinfn cleanup! 
*/ - sl_thd_yield(0); + sl_thd_yield_thd(spin_thd); + + sl_thd_exit(); +} + +void +test_inv_perffn(void *data) +{ + cycles_t start_cycs = 0, end_cycs = 0, wc_cycs = 0, total_cycs = 0; + unsigned int i = 0; + struct sl_thd *c = sl_thd_curr(); + + test_inv_setup(); + + rdtscll(start_cycs); + sl_thd_yield_thd(spin_thd); + rdtscll(end_cycs); + assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); + + for (i = 0; i < PERF_ITERS; i++) { + cycles_t diff_cycs = 0; + int ret; + + sl_thd_yield_thd_c(c, spin_thd); + mid_cycs = 0; + rdtscll(start_cycs); + ret = call_cap_mb(sinv_cap, 1, 2, 3); + rdtscll(end_cycs); + assert(ret == (int)MAGIC_RET); +// assert(mid_cycs && mid_cycs > start_cycs && mid_cycs < end_cycs); +// +// diff1_cycs = mid_cycs - start_cycs; +// diff2_cycs = end_cycs - mid_cycs; +// +// if (diff1_cycs > wc_cycs) wc_cycs = diff1_cycs; +// if (diff2_cycs > wc_cycs) wc_cycs = diff2_cycs; +// total_cycs += (diff1_cycs + diff2_cycs); + diff_cycs = end_cycs - start_cycs; + if (diff_cycs > wc_cycs) wc_cycs = diff_cycs; + total_cycs += diff_cycs; + } + + PRINTC("INV UBENCH: avg: %llu, wc: %llu, iters:%u\n", (total_cycs / PERF_ITERS), wc_cycs, PERF_ITERS); + testing = 0; + /* done testing! let the spinfn cleanup! */ + sl_thd_yield_thd(spin_thd); sl_thd_exit(); } @@ -77,9 +187,11 @@ test_thd_perffn(void *data) void test_thd_spinfn(void *data) { + struct sl_thd *c = sl_thd_curr(); + while (likely(testing)) { rdtscll(mid_cycs); - sl_thd_yield(0); + sl_thd_yield_thd_c(c, perf_thd); } sl_thd_exit(); @@ -105,8 +217,17 @@ test_yield_perf(void) union sched_param_union sp = {.c = {.type = SCHEDP_PRIO, .value = 31}}; for (i = 0; i < N_TESTTHDS_PERF; i++) { - if (i == 1) threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); - else threads[i] = sl_thd_alloc(test_thd_spinfn, NULL); + if (i == 1) { +#ifdef INV_TEST + threads[i] = sl_thd_alloc(test_inv_perffn, (void *)&threads[0]); +#else + threads[i] = sl_thd_alloc(test_thd_perffn, (void *)&threads[0]); +#endif + perf_thd = threads[i]; + } else { + threads[i] = sl_thd_alloc(test_thd_spinfn, NULL); + spin_thd = threads[i]; + } assert(threads[i]); sl_thd_param_set(threads[i], sp.v); PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); @@ -221,10 +342,10 @@ cos_init(void) cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_llinit(); cos_dcb_info_init_curr(); - sl_init(SL_MIN_PERIOD_US); + sl_init(SL_MIN_PERIOD_US*100); - //test_yield_perf(); - test_yields(); + test_yield_perf(); + //test_yields(); //test_blocking_directed_yield(); //test_timeout_wakeup(); diff --git a/src/components/include/crt_chan.h b/src/components/include/crt_chan.h index 39a06974f4..ea833db694 100644 --- a/src/components/include/crt_chan.h +++ b/src/components/include/crt_chan.h @@ -93,7 +93,8 @@ __crt_chan_buff_idx(struct crt_chan *c, u32_t v, u32_t wraparound_mask) static inline int __crt_chan_full(struct crt_chan *c, u32_t wraparound_mask) -{ return c->consumer == __crt_chan_buff_idx(c, c->producer + 1, wraparound_mask); } +{ return __crt_chan_buff_idx(c, c->consumer, wraparound_mask) == __crt_chan_buff_idx(c, c->producer + 1, wraparound_mask); } +//{ return c->consumer == __crt_chan_buff_idx(c, c->producer + 1, wraparound_mask); } static inline int __crt_chan_empty(struct crt_chan *c, u32_t wraparound_mask) diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index a42f9ad64a..dff7b9e42d 100644 --- a/src/components/include/part_task.h +++ 
b/src/components/include/part_task.h @@ -11,10 +11,10 @@ #define PART_THD_COREID(t) (t >> 16) #define PART_THD_THDID(t) ((t << 16) >> 16) -#define PART_MAX_TASKS 2048 -#define PART_MAX_DATA 2048 +#define PART_MAX_TASKS (NUM_CPU < 4 ? 2048 : 8192) +#define PART_MAX_DATA PART_MAX_TASKS #define PART_MAX_PAR_THDS NUM_CPU -#define PART_MAX_CORE_THDS 48 +#define PART_MAX_CORE_THDS (NUM_CPU == 1 ? 200 : (NUM_CPU == 2 ? 128 : (NUM_CPU < 5 ? 64 : 48))) #define PART_MAX_THDS 512 #define PART_MAX_CHILD 1024 #define PART_MAX_WORKSHARES 16 diff --git a/src/kernel/include/shared/cos_config.h b/src/kernel/include/shared/cos_config.h index 8c46ae5377..bf501b3be9 100644 --- a/src/kernel/include/shared/cos_config.h +++ b/src/kernel/include/shared/cos_config.h @@ -17,7 +17,7 @@ #include "cpu_ghz.h" -#define NUM_CPU 2 +#define NUM_CPU 1 #define NUM_CPU_BMP_BYTES ((NUM_CPU + 7) / 8) #define NUM_CPU_BMP_WORDS ((NUM_CPU_BMP_BYTES + 3) / 4) diff --git a/src/platform/i386/runscripts/omp_ubench.sh b/src/platform/i386/runscripts/omp_ubench.sh new file mode 100644 index 0000000000..100adcb020 --- /dev/null +++ b/src/platform/i386/runscripts/omp_ubench.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +cp omp_ubench.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub + +#cp llboot_comp.o llboot.o +#cp omp_ubench.o boot.o +#cp test_boot.o dummy1.o +#cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From fd0aa6475b689af64072ee70ccb832a0d3f3eaca Mon Sep 17 00:00:00 2001 From: Phani Date: Fri, 13 Sep 2019 14:08:10 -0400 Subject: [PATCH 107/127] bugfix after removing code duplication in capmgr --- .../implementation/capmgr/naive/init.c | 5 +++-- .../no_interface/llbooter/boot_deps.h | 22 ++++++++++++++++++- src/components/include/hypercall.h | 7 ++++++ 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index 35f5b0edd1..b150101e29 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -22,13 +22,13 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = cos_compinfo_get(defci); struct cap_comp_info *btinfo = cap_info_comp_find(0); - spdid_t sched_spdid = 0; struct cap_comp_info *rci_sched = NULL; struct cap_comp_cpu_info *rci_cpu = NULL; struct sl_thd *ithd = NULL; u64_t chbits = 0, chschbits = 0; int ret = 0, is_sched = 0; int remain_child = 0; + spdid_t sched_spdid = 0; spdid_t childid; comp_flag_t ch_flags; struct cos_aep_info aep; @@ -38,6 +38,7 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) assert(cap_info_init_check(rci)); rci_cpu = cap_info_cpu_local(rci); + sched_spdid = hypercall_comp_sched_get(spdid); if (spdid == 0 || (spdid != cos_spd_id() && cap_info_is_child(btinfo, spdid))) { is_sched = (spdid == 0 || cap_info_is_sched_child(btinfo, spdid)) ? 
1 : 0; @@ -48,7 +49,7 @@ capmgr_comp_info_init(struct cap_comp_info *rci, spdid_t spdid) } rci_sched = cap_info_comp_find(sched_spdid); - assert(rci_sched && cap_info_init_check(rci_sched)); + assert(rci_sched); rci_cpu->parent = rci_sched; rci_cpu->thd_used = 1; if (cos_cpuid() != INIT_CORE) cap_info_cpu_initdcb_init(rci); diff --git a/src/components/implementation/no_interface/llbooter/boot_deps.h b/src/components/implementation/no_interface/llbooter/boot_deps.h index f6a961dc0e..59e11fadf8 100644 --- a/src/components/implementation/no_interface/llbooter/boot_deps.h +++ b/src/components/implementation/no_interface/llbooter/boot_deps.h @@ -544,6 +544,17 @@ boot_comp_cap_cpy_at(spdid_t dstid, capid_t dstslot, spdid_t srcid, cap_t captyp return ret; } +static inline int +boot_comp_sched_get(spdid_t dstid, spdid_t srcid) +{ + struct comp_sched_info *si = NULL; + + if (srcid > num_cobj || dstid > num_cobj) return -EINVAL; + si = boot_spd_comp_schedinfo_get(srcid); + + return si->parent_spdid; +} + static inline int boot_comp_initaep_get(spdid_t dstid, spdid_t srcid, thdcap_t thdslot, arcvcap_t rcvslot, tcap_t tcslot, spdid_t *parent) { @@ -599,9 +610,9 @@ boot_root_initaep_set(spdid_t dstid, spdid_t srcid, thdcap_t thd, arcvcap_t rcv, a->rcv = cos_cap_cpy(b, c, CAP_ARCV, rcv); assert(a->rcv); +done: boot_comp_sched_set(srcid); -done: return 0; } @@ -850,6 +861,15 @@ hypercall_entry(word_t *ret2, word_t *ret3, int op, word_t arg3, word_t arg4, wo break; } + case HYPERCALL_COMP_SCHED_GET: + { + spdid_t srcid = arg3; + + if (!__hypercall_resource_access_check(client, srcid, 1)) return -EACCES; + ret1 = boot_comp_sched_get(client, srcid); + + break; + } case HYPERCALL_NUMCOMPS_GET: { ret1 = num_cobj + 1; /* including booter */ diff --git a/src/components/include/hypercall.h b/src/components/include/hypercall.h index aa545ff77f..ee3caeb312 100644 --- a/src/components/include/hypercall.h +++ b/src/components/include/hypercall.h @@ -17,6 +17,7 @@ enum hypercall_cntl { HYPERCALL_COMP_INITAEP_GET, HYPERCALL_COMP_CHILD_NEXT, HYPERCALL_COMP_CPUBITMAP_GET, + HYPERCALL_COMP_SCHED_GET, HYPERCALL_NUMCOMPS_GET, @@ -190,6 +191,12 @@ hypercall_comp_pgtblcap_get(spdid_t spdid) return ptslot; } +static inline spdid_t +hypercall_comp_sched_get(spdid_t spdid) +{ + return cos_sinv(BOOT_CAPTBL_SINV_CAP, HYPERCALL_COMP_SCHED_GET, spdid, 0, 0); +} + static inline int hypercall_comp_cpubitmap_get(spdid_t spdid, u32_t *bmp) { From d15bdf893acf856303429e2357c2a57736591a5f Mon Sep 17 00:00:00 2001 From: Phani Date: Fri, 13 Sep 2019 16:07:51 -0400 Subject: [PATCH 108/127] debugging sched thread user-level switch --- .../implementation/sched/sched_init.c | 2 +- .../implementation/tests/unit_slrcv/Makefile | 4 +- .../implementation/tests/unit_slrcv/init.c | 70 +++++++++---- src/components/include/cos_ulsched_rcv.h | 2 +- src/components/include/sl.h | 50 +++++++++- src/components/lib/sl/sl_sched.c | 97 ++++++++++++------- src/kernel/capinv.c | 3 + src/kernel/include/shared/cos_sched.h | 9 +- src/kernel/include/shared/cos_types.h | 1 + src/kernel/include/thd.h | 16 ++- src/platform/i386/runscripts/unit_slite01.sh | 7 ++ 11 files changed, 191 insertions(+), 70 deletions(-) create mode 100644 src/platform/i386/runscripts/unit_slite01.sh diff --git a/src/components/implementation/sched/sched_init.c b/src/components/implementation/sched/sched_init.c index a448247050..d528982980 100644 --- a/src/components/implementation/sched/sched_init.c +++ b/src/components/implementation/sched/sched_init.c @@ -37,7 +37,7 @@ 
schedinit_child(void) if (!init) return 0; tcur = sl_thd_curr(); if (!tcur) return 0; - assert(tcur->schedthd == init); + assert(tcur->schedthd == init || tcur == init); /* thd retrieve */ do { diff --git a/src/components/implementation/tests/unit_slrcv/Makefile b/src/components/implementation/tests/unit_slrcv/Makefile index 0c3074e079..11d3890dde 100644 --- a/src/components/implementation/tests/unit_slrcv/Makefile +++ b/src/components/implementation/tests/unit_slrcv/Makefile @@ -1,8 +1,8 @@ COMPONENT=unit_slrcvtest.o INTERFACES= -DEPENDENCIES= +DEPENDENCIES=capmgr schedinit IF_LIB= -ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend -lcos_dcb +ADDITIONAL_LIBS=-lcobj_format $(LIBSLCAPMGR) -lsl_mod_rr -lsl_thd_static_backend -lcos_dcb include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c index 8a1fb73eaf..fde5b50a52 100644 --- a/src/components/implementation/tests/unit_slrcv/init.c +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "spinlib.h" static struct sl_xcore_thd *ping; @@ -49,33 +50,60 @@ cos_init(void *d) static volatile asndcap_t s = 0; unsigned int cycs_per_us = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - assert(NUM_CPU == 2); + if (NUM_CPU == 2) { + if (cos_cpuid() == 0) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US); + spinlib_calib(cycs_per_us); - if (cos_cpuid() == 0) { + struct sl_thd *t = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + r = sl_thd_rcvcap(t); + assert(r); + } else { + while (!ps_load(&init_done[0])) ; + + cos_defcompinfo_sched_init(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US); + + struct sl_thd *t = sl_thd_alloc(ping_fn, (void *)&s); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + + while (!r) ; + s = cos_asnd_alloc(ci, r, ci->captbl_cap); + assert(s); + } + } else { + assert(NUM_CPU == 1); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_llinit(); - cos_dcb_info_init_curr(); + printc("%s:%d\n", __func__, __LINE__); + cos_defcompinfo_init(); + printc("%s:%d\n", __func__, __LINE__); + //cos_dcb_info_init_curr(); + printc("%s:%d\n", __func__, __LINE__); sl_init(SL_MIN_PERIOD_US); spinlib_calib(cycs_per_us); - struct sl_thd *t = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); - assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); - r = sl_thd_rcvcap(t); + printc("%s:%d\n", __func__, __LINE__); + struct sl_thd *rt = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); + assert(rt); + sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + r = sl_thd_rcvcap(rt); assert(r); - } else { - while (!ps_load(&init_done[0])) ; - - cos_defcompinfo_sched_init(); - cos_dcb_info_init_curr(); - sl_init(SL_MIN_PERIOD_US); - - struct sl_thd *t = sl_thd_alloc(ping_fn, (void *)&s); - assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + printc("%s:%d\n", __func__, __LINE__); + struct sl_thd *st = sl_thd_alloc(ping_fn, (void *)&s); + assert(st); + sl_thd_param_set(st, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + printc("%s:%d\n", __func__, 
__LINE__); - while (!r) ; - s = cos_asnd_alloc(ci, r, ci->captbl_cap); + //s = cos_asnd_alloc(ci, r, ci->captbl_cap); + //assert(s); + s = capmgr_asnd_rcv_create(r); assert(s); } ps_faa(&init_done[cos_cpuid()], 1); @@ -84,6 +112,8 @@ cos_init(void *d) for (i = 0; i < NUM_CPU; i++) { while (!ps_load(&init_done[i])) ; } + //hypercall_comp_init_done(); + schedinit_child(); sl_sched_loop_nonblock(); PRINTC("Should never get here!\n"); diff --git a/src/components/include/cos_ulsched_rcv.h b/src/components/include/cos_ulsched_rcv.h index 29a470e2b1..881d0da7f6 100644 --- a/src/components/include/cos_ulsched_rcv.h +++ b/src/components/include/cos_ulsched_rcv.h @@ -15,8 +15,8 @@ __cos_sched_event_consume(struct cos_sched_ring *r, struct cos_sched_event *e) int f = 0; if (!r || !e || !__cos_sched_events_present(r)) return 0; - *e = r->event_buf[f]; f = ps_upfaa((unsigned long *)&r->head, 1); + *e = r->event_buf[f]; // memcpy((void *)e, (void *)&(r->event_buf[f]), sizeof(struct cos_sched_event)); return 1; diff --git a/src/components/include/sl.h b/src/components/include/sl.h index f6fe36cddf..54f3a10a33 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -41,7 +41,7 @@ #include #define SL_CS -#undef SL_REPLENISH +#define SL_REPLENISH /* Critical section (cs) API to protect scheduler data-structures */ struct sl_cs { @@ -574,7 +574,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) struct sl_thd *t = to; struct sl_global_core *globals = sl__globals_core(); sched_tok_t tok; -// cycles_t now; + cycles_t now; s64_t offset; int ret; @@ -584,7 +584,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) #endif tok = cos_sched_sync(); -// now = sl_now(); + now = sl_now(); /* still wakeup without timeouts? that adds to dispatch overhead! */ // offset = (s64_t)(globals->timer_next - now); @@ -893,14 +893,51 @@ sl_thd_yield_timeout(thdid_t tid, cycles_t abs_timeout) } } +static inline void +sl_thd_event_info_reset(struct sl_thd *t) +{ + t->event_info.blocked = 0; + t->event_info.elapsed_cycs = 0; + t->event_info.next_timeout = 0; + t->event_info.epoch = 0; +} + +static inline void +sl_thd_event_enqueue(struct sl_thd *t, struct cos_thd_event *e) +{ + struct sl_global_core *g = sl__globals_core(); + + if (e->epoch <= t->event_info.epoch) return; + + if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); + + t->event_info.blocked = e->blocked; + t->event_info.elapsed_cycs += e->elapsed_cycs; + t->event_info.next_timeout = e->next_timeout; +} + +static inline void +sl_thd_event_dequeue(struct sl_thd *t, struct cos_thd_event *e) +{ + ps_list_rem(t, SL_THD_EVENT_LIST); + + e->blocked = t->event_info.blocked; + e->elapsed_cycs = t->event_info.elapsed_cycs; + e->next_timeout = t->event_info.next_timeout; + sl_thd_event_info_reset(t); +} + static inline int sl_thd_rcv(rcv_flags_t flags) { + /* FIXME: elapsed_cycs accounting..?? */ + struct cos_thd_event ev = { .blocked = 1, .next_timeout = 0, .epoch = 0, .elapsed_cycs = 0 }; struct sl_thd *t = sl_thd_curr(); unsigned long *p = &sl_thd_dcbinfo(t)->pending, q = 0; int ret = 0; assert(sl_thd_rcvcap(t)); + assert(!(flags & RCV_ULSCHED_RCV)); check: sl_cs_enter(); /* there no pending event in the dcbinfo->pending */ @@ -911,7 +948,12 @@ sl_thd_rcv(rcv_flags_t flags) goto done; } - sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); + ev.epoch = sl_now(); + sl_thd_event_enqueue(t, &ev); + /* + * TODO: add event so sched thread will do this? 
+ * sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); + */ sl_cs_exit_switchto(sl__globals_core()->sched_thd); goto check; diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 60c07c1da1..4bde91be00 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -501,37 +501,6 @@ sl_thd_yield_intern_timeout(cycles_t abs_timeout) sl_cs_exit_schedule_timeout(abs_timeout); } -void -sl_thd_event_info_reset(struct sl_thd *t) -{ - t->event_info.blocked = 0; - t->event_info.elapsed_cycs = 0; - t->event_info.next_timeout = 0; -} - -static inline void -sl_thd_event_enqueue(struct sl_thd *t, struct cos_thd_event *e) -{ - struct sl_global_core *g = sl__globals_core(); - - if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); - - t->event_info.blocked = e->blocked; - t->event_info.elapsed_cycs += e->elapsed_cycs; - t->event_info.next_timeout = e->next_timeout; -} - -static inline void -sl_thd_event_dequeue(struct sl_thd *t, struct cos_thd_event *e) -{ - ps_list_rem(t, SL_THD_EVENT_LIST); - - e->blocked = t->event_info.blocked; - e->elapsed_cycs = t->event_info.elapsed_cycs; - e->next_timeout = t->event_info.next_timeout; - sl_thd_event_info_reset(t); -} - void sl_thd_exit() { @@ -674,6 +643,67 @@ sl_init(microsec_t period) sl_init_corebmp(period, corebmp); } +static inline int +__sl_sched_events_present(void) +{ + struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_sched_ring *ring = &scb->sched_events; + + return __cos_sched_events_present(ring); +} + +static inline int +__sl_sched_event_consume(struct cos_sched_event *e) +{ + struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_sched_ring *ring = &scb->sched_events; + + return __cos_sched_event_consume(ring, e); +} + +static inline int +__sl_sched_rcv(rcv_flags_t rf, struct cos_sched_event *e) +{ + struct sl_global_core *g = sl__globals_core(); + struct sl_thd *curr = sl_thd_curr(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr); + int ret = 0; + + assert(curr == g->sched_thd); + if (!cd) return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); + + rf |= RCV_ULSCHED_RCV; + + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $1f, (%%eax)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "movl $2f, %%ecx\n\t" \ + "movl %%edx, %%eax\n\t" \ + "inc %%eax\n\t" \ + "shl $16, %%eax\n\t" \ + "movl $0, %%edx\n\t" \ + "movl $0, %%esi\n\t" \ + "movl $0, %%edi\n\t" \ + "sysenter\n\t" \ + "jmp 2f\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $1, %%eax\n\t" \ + "2:\n\t" \ + "popl %%ebp\n\t" \ + : + : "a" (cd), "b" (rf), "c" (g->timeout_next), "d" (g->sched_rcv) + : "memory", "cc"); + + cd = sl_thd_dcbinfo(sl_thd_curr()); + cd->sp = 0; + + rf |= RCV_ULONLY; + return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); +} + static void sl_sched_loop_intern(int non_block) { @@ -696,7 +726,8 @@ sl_sched_loop_intern(int non_block) * states of it's child threads) and normal notifications (mainly activations from * it's parent scheduler). */ - pending = cos_ul_sched_rcv(g->sched_rcv, rfl, g->timeout_next, &e); + //pending = cos_ul_sched_rcv(g->sched_rcv, rfl, g->timeout_next, &e); + pending = __sl_sched_rcv(rfl, &e); if (pending < 0 || !e.tid) goto pending_events; @@ -817,7 +848,7 @@ sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now) /* tcap_transfer will assign sched_tcap's prio to t's tcap if t->prio == 0, which we don't want. 
*/ assert(t->prio >= TCAP_PRIO_MAX && t->prio <= TCAP_PRIO_MIN); - ret = cos_tcap_transfer(sl_thd_rcvcap(t), globals->sched_tcap, transfer, t->prio); + ret = cos_tcap_transfer(sl_thd_rcvcap(t), sl__globals_core()->sched_tcap, transfer, t->prio); } if (likely(ret == 0)) t->last_replenish = replenish; diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index c280a4cfda..30e525410a 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -539,7 +539,9 @@ notify_parent(struct thread *rcv_thd, int send) { struct thread *curr_notif = NULL, *prev_notif = NULL, *arcv_notif = NULL; int depth = 0; + cycles_t now; + rdtscll(now); /* hierarchical notifications - upto init (bounded by ARCV_NOTIF_DEPTH) */ prev_notif = rcv_thd; curr_notif = arcv_notif = arcv_thd_notif(prev_notif); @@ -547,6 +549,7 @@ notify_parent(struct thread *rcv_thd, int send) while (curr_notif && curr_notif != prev_notif) { assert(depth < ARCV_NOTIF_DEPTH); + prev_notif->event_epoch = now; thd_rcvcap_evt_enqueue(curr_notif, prev_notif); if (!(curr_notif->state & THD_STATE_RCVING)) break; diff --git a/src/kernel/include/shared/cos_sched.h b/src/kernel/include/shared/cos_sched.h index eef5664464..bf6b7ef6d1 100644 --- a/src/kernel/include/shared/cos_sched.h +++ b/src/kernel/include/shared/cos_sched.h @@ -4,9 +4,10 @@ #include "./cos_types.h" struct cos_thd_event { - u16_t blocked; - u32_t next_timeout; - u64_t elapsed_cycs; + u16_t blocked; + u32_t next_timeout; + u64_t elapsed_cycs; + u64_t epoch; } __attribute__((packed)); struct cos_sched_event { @@ -25,7 +26,7 @@ struct cos_scb_info { capid_t curr_thd; cycles_t timer_next; sched_tok_t sched_tok; - struct cos_sched_ring sched_events; + struct cos_sched_ring sched_events; /* kernel-level events only */ } CACHE_ALIGNED; struct cos_dcb_info { diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index afb92c1edb..e67708b7bf 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -73,6 +73,7 @@ typedef enum { typedef enum { RCV_NON_BLOCKING = 1, RCV_ULONLY = (1 << 1), + RCV_ULSCHED_RCV = (1 << 2), } rcv_flags_t; #define BOOT_LIVENESS_ID_BASE 2 diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index e507033db8..6c3396aa7d 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -75,6 +75,7 @@ struct thread { struct rcvcap_info rcvcap; struct list event_head; /* all events for *this* end-point */ struct list_node event_list; /* the list of events for another end-point */ + u64_t event_epoch; /* used by user-level for ULSCHED events.. 
*/ } CACHE_ALIGNED; /* @@ -264,7 +265,7 @@ thd_rcvcap_pending_reset(struct thread *arcvt) } static inline int -thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long *cycles, unsigned long *timeout) +thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long *cycles, unsigned long *timeout, u64_t *epoch) { struct thread *e = thd_rcvcap_evt_dequeue(t); @@ -276,6 +277,8 @@ thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long e->exec = 0; *timeout = e->timeout; e->timeout = 0; + *epoch = e->event_epoch; + e->event_epoch = 0; return 1; } @@ -607,7 +610,7 @@ thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info if (unlikely(inv_top != 0 || thd->rcvcap.is_init == 0)) return 0; c = thd_invstk_peek_compinfo(thd, cos_info, inv_top); - if (unlikely(!c || !c->scb_data)) return 0; + if (unlikely(!c || !c->scb_data)) return -ENOENT; scb = ((c->scb_data) + get_cpuid()); r = &(scb->sched_events); @@ -623,7 +626,7 @@ thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info unsigned long thd_state; if (!thd_state_evt_deliver(thd, &thd_state, (unsigned long *)&(e->evt.elapsed_cycs), - (unsigned long *)&(e->evt.next_timeout))) break; + (unsigned long *)&(e->evt.next_timeout), &(e->evt.epoch))) break; e->tid = (thd_state << 1) >> 1; e->evt.blocked = (thd_state >> 31); @@ -639,10 +642,13 @@ static inline void thd_rcvcap_pending_deliver(struct thread *thd, struct pt_regs *regs) { unsigned long thd_state = 0, cycles = 0, timeout = 0; + u64_t epoch = 0; - thd_state_evt_deliver(thd, &thd_state, &cycles, &timeout); + /* events only in scb now, no return values... */ thd_rcvcap_pending_reset(thd); - thd_sched_events_produce(thd, cos_cpu_local_info()); + if (thd_sched_events_produce(thd, cos_cpu_local_info()) == -ENOENT) { + thd_state_evt_deliver(thd, &thd_state, &cycles, &timeout, &epoch); + } __userregs_setretvals(regs, thd_rcvcap_pending(thd), thd_state, cycles, timeout); } diff --git a/src/platform/i386/runscripts/unit_slite01.sh b/src/platform/i386/runscripts/unit_slite01.sh new file mode 100644 index 0000000000..511e793c8e --- /dev/null +++ b/src/platform/i386/runscripts/unit_slite01.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp root_fprr.o boot.o +#cp unit_slrcvtest.o boot.o +cp test_boot.o dummy1.o +./cos_linker "llboot.o, ;*unit_slrcvtest.o, ;capmgr.o, ;dummy1.o, ;*boot.o, :boot.o-capmgr.o;unit_slrcvtest.o-boot.o|capmgr.o" ./gen_client_stub From 6d588a855b0dc44e0714fef34de2b3089257c112 Mon Sep 17 00:00:00 2001 From: Phani Date: Fri, 13 Sep 2019 16:07:51 -0400 Subject: [PATCH 109/127] possible working sl_sched_rcv - TODO: test interrupt to sched (direct) switch.. 
--- .../implementation/sched/sched_init.c | 2 +- .../implementation/tests/unit_slrcv/Makefile | 4 +- .../implementation/tests/unit_slrcv/init.c | 66 +++++++++---- .../implementation/tests/unit_slrcv/spinlib.c | 64 +++++++----- src/components/include/cos_ulsched_rcv.h | 2 +- src/components/include/sl.h | 50 +++++++++- src/components/lib/sl/sl_sched.c | 98 ++++++++++++------- src/kernel/capinv.c | 3 + src/kernel/include/shared/cos_sched.h | 9 +- src/kernel/include/shared/cos_types.h | 1 + src/kernel/include/thd.h | 16 ++- src/platform/i386/runscripts/unit_slite01.sh | 7 ++ 12 files changed, 225 insertions(+), 97 deletions(-) create mode 100644 src/platform/i386/runscripts/unit_slite01.sh diff --git a/src/components/implementation/sched/sched_init.c b/src/components/implementation/sched/sched_init.c index a448247050..d528982980 100644 --- a/src/components/implementation/sched/sched_init.c +++ b/src/components/implementation/sched/sched_init.c @@ -37,7 +37,7 @@ schedinit_child(void) if (!init) return 0; tcur = sl_thd_curr(); if (!tcur) return 0; - assert(tcur->schedthd == init); + assert(tcur->schedthd == init || tcur == init); /* thd retrieve */ do { diff --git a/src/components/implementation/tests/unit_slrcv/Makefile b/src/components/implementation/tests/unit_slrcv/Makefile index 0c3074e079..11d3890dde 100644 --- a/src/components/implementation/tests/unit_slrcv/Makefile +++ b/src/components/implementation/tests/unit_slrcv/Makefile @@ -1,8 +1,8 @@ COMPONENT=unit_slrcvtest.o INTERFACES= -DEPENDENCIES= +DEPENDENCIES=capmgr schedinit IF_LIB= -ADDITIONAL_LIBS=-lcobj_format $(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend -lcos_dcb +ADDITIONAL_LIBS=-lcobj_format $(LIBSLCAPMGR) -lsl_mod_rr -lsl_thd_static_backend -lcos_dcb include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c index 8a1fb73eaf..badd1165a4 100644 --- a/src/components/implementation/tests/unit_slrcv/init.c +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "spinlib.h" static struct sl_xcore_thd *ping; @@ -49,33 +50,54 @@ cos_init(void *d) static volatile asndcap_t s = 0; unsigned int cycs_per_us = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - assert(NUM_CPU == 2); + if (NUM_CPU == 2) { + if (cos_cpuid() == 0) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_llinit(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US); + spinlib_calib(cycs_per_us); - if (cos_cpuid() == 0) { + struct sl_thd *t = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + r = sl_thd_rcvcap(t); + assert(r); + } else { + while (!ps_load(&init_done[0])) ; + + cos_defcompinfo_sched_init(); + cos_dcb_info_init_curr(); + sl_init(SL_MIN_PERIOD_US); + + struct sl_thd *t = sl_thd_alloc(ping_fn, (void *)&s); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + + while (!r) ; + s = cos_asnd_alloc(ci, r, ci->captbl_cap); + assert(s); + } + } else { + assert(NUM_CPU == 1); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_llinit(); - cos_dcb_info_init_curr(); + cos_defcompinfo_init(); + //cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); spinlib_calib(cycs_per_us); - struct sl_thd *t = sl_thd_aep_alloc(pong_fn, NULL, 
0, 0, 0, 0); - assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); - r = sl_thd_rcvcap(t); + struct sl_thd *rt = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); + assert(rt); + sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + r = sl_thd_rcvcap(rt); assert(r); - } else { - while (!ps_load(&init_done[0])) ; - - cos_defcompinfo_sched_init(); - cos_dcb_info_init_curr(); - sl_init(SL_MIN_PERIOD_US); - - struct sl_thd *t = sl_thd_alloc(ping_fn, (void *)&s); - assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); + struct sl_thd *st = sl_thd_alloc(ping_fn, (void *)&s); + assert(st); + sl_thd_param_set(st, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); - while (!r) ; - s = cos_asnd_alloc(ci, r, ci->captbl_cap); + //s = cos_asnd_alloc(ci, r, ci->captbl_cap); + //assert(s); + s = capmgr_asnd_rcv_create(r); assert(s); } ps_faa(&init_done[cos_cpuid()], 1); @@ -84,7 +106,9 @@ cos_init(void *d) for (i = 0; i < NUM_CPU; i++) { while (!ps_load(&init_done[i])) ; } - sl_sched_loop_nonblock(); + //hypercall_comp_init_done(); + schedinit_child(); + sl_sched_loop(); PRINTC("Should never get here!\n"); assert(0); diff --git a/src/components/implementation/tests/unit_slrcv/spinlib.c b/src/components/implementation/tests/unit_slrcv/spinlib.c index 22ff1218b3..e17341b5fe 100644 --- a/src/components/implementation/tests/unit_slrcv/spinlib.c +++ b/src/components/implementation/tests/unit_slrcv/spinlib.c @@ -76,35 +76,47 @@ spinlib_calib(unsigned int cycs_per_us) void spinlib_cycles(cycles_t cycs) { - unsigned int i = 0; - unsigned int iters = cycs / spinlib_cycs_per_spin_iters; - unsigned int left = cycs % spinlib_cycs_per_spin_iters; - - assert(cycs >= spinlib_cycs_per_spin_iters); - - /* round off to next cycs/spin */ - if (left >= (spinlib_cycs_per_spin_iters / 2)) iters ++; - - while (i < iters) { - spinlib_std_iters(); - i ++; - } +// unsigned int i = 0; +// unsigned int iters = cycs / spinlib_cycs_per_spin_iters; +// unsigned int left = cycs % spinlib_cycs_per_spin_iters; +// +// assert(cycs >= spinlib_cycs_per_spin_iters); +// +// /* round off to next cycs/spin */ +// if (left >= (spinlib_cycs_per_spin_iters / 2)) iters ++; +// +// while (i < iters) { +// spinlib_std_iters(); +// i ++; +// } + unsigned long long st, en; + + rdtscll(st); + en = st + cycs; + + // doesn't work with concurrency.. but don't care for now. 
+ do { + rdtscll(st); + } while (st < en); } void spinlib_usecs(cycles_t usecs) { - unsigned int i = 0; - unsigned int iters = usecs / spinlib_usecs_per_spin_iters; - unsigned int left = usecs % spinlib_usecs_per_spin_iters; - - assert(usecs >= spinlib_usecs_per_spin_iters); - - /* round off to next usec */ - if (left >= (spinlib_usecs_per_spin_iters / 2)) iters ++; - - while (i < iters) { - spinlib_std_iters(); - i ++; - } + unsigned long long cycs = sl_usec2cyc(usecs); + + spinlib_cycles(cycs); +// unsigned int i = 0; +// unsigned int iters = usecs / spinlib_usecs_per_spin_iters; +// unsigned int left = usecs % spinlib_usecs_per_spin_iters; +// +// assert(usecs >= spinlib_usecs_per_spin_iters); +// +// /* round off to next usec */ +// if (left >= (spinlib_usecs_per_spin_iters / 2)) iters ++; +// +// while (i < iters) { +// spinlib_std_iters(); +// i ++; +// } } diff --git a/src/components/include/cos_ulsched_rcv.h b/src/components/include/cos_ulsched_rcv.h index 29a470e2b1..881d0da7f6 100644 --- a/src/components/include/cos_ulsched_rcv.h +++ b/src/components/include/cos_ulsched_rcv.h @@ -15,8 +15,8 @@ __cos_sched_event_consume(struct cos_sched_ring *r, struct cos_sched_event *e) int f = 0; if (!r || !e || !__cos_sched_events_present(r)) return 0; - *e = r->event_buf[f]; f = ps_upfaa((unsigned long *)&r->head, 1); + *e = r->event_buf[f]; // memcpy((void *)e, (void *)&(r->event_buf[f]), sizeof(struct cos_sched_event)); return 1; diff --git a/src/components/include/sl.h b/src/components/include/sl.h index f6fe36cddf..54f3a10a33 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -41,7 +41,7 @@ #include #define SL_CS -#undef SL_REPLENISH +#define SL_REPLENISH /* Critical section (cs) API to protect scheduler data-structures */ struct sl_cs { @@ -574,7 +574,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) struct sl_thd *t = to; struct sl_global_core *globals = sl__globals_core(); sched_tok_t tok; -// cycles_t now; + cycles_t now; s64_t offset; int ret; @@ -584,7 +584,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) #endif tok = cos_sched_sync(); -// now = sl_now(); + now = sl_now(); /* still wakeup without timeouts? that adds to dispatch overhead! */ // offset = (s64_t)(globals->timer_next - now); @@ -893,14 +893,51 @@ sl_thd_yield_timeout(thdid_t tid, cycles_t abs_timeout) } } +static inline void +sl_thd_event_info_reset(struct sl_thd *t) +{ + t->event_info.blocked = 0; + t->event_info.elapsed_cycs = 0; + t->event_info.next_timeout = 0; + t->event_info.epoch = 0; +} + +static inline void +sl_thd_event_enqueue(struct sl_thd *t, struct cos_thd_event *e) +{ + struct sl_global_core *g = sl__globals_core(); + + if (e->epoch <= t->event_info.epoch) return; + + if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); + + t->event_info.blocked = e->blocked; + t->event_info.elapsed_cycs += e->elapsed_cycs; + t->event_info.next_timeout = e->next_timeout; +} + +static inline void +sl_thd_event_dequeue(struct sl_thd *t, struct cos_thd_event *e) +{ + ps_list_rem(t, SL_THD_EVENT_LIST); + + e->blocked = t->event_info.blocked; + e->elapsed_cycs = t->event_info.elapsed_cycs; + e->next_timeout = t->event_info.next_timeout; + sl_thd_event_info_reset(t); +} + static inline int sl_thd_rcv(rcv_flags_t flags) { + /* FIXME: elapsed_cycs accounting..?? 
*/ + struct cos_thd_event ev = { .blocked = 1, .next_timeout = 0, .epoch = 0, .elapsed_cycs = 0 }; struct sl_thd *t = sl_thd_curr(); unsigned long *p = &sl_thd_dcbinfo(t)->pending, q = 0; int ret = 0; assert(sl_thd_rcvcap(t)); + assert(!(flags & RCV_ULSCHED_RCV)); check: sl_cs_enter(); /* there no pending event in the dcbinfo->pending */ @@ -911,7 +948,12 @@ sl_thd_rcv(rcv_flags_t flags) goto done; } - sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); + ev.epoch = sl_now(); + sl_thd_event_enqueue(t, &ev); + /* + * TODO: add event so sched thread will do this? + * sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); + */ sl_cs_exit_switchto(sl__globals_core()->sched_thd); goto check; diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 60c07c1da1..41dbd25ce0 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -501,37 +501,6 @@ sl_thd_yield_intern_timeout(cycles_t abs_timeout) sl_cs_exit_schedule_timeout(abs_timeout); } -void -sl_thd_event_info_reset(struct sl_thd *t) -{ - t->event_info.blocked = 0; - t->event_info.elapsed_cycs = 0; - t->event_info.next_timeout = 0; -} - -static inline void -sl_thd_event_enqueue(struct sl_thd *t, struct cos_thd_event *e) -{ - struct sl_global_core *g = sl__globals_core(); - - if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); - - t->event_info.blocked = e->blocked; - t->event_info.elapsed_cycs += e->elapsed_cycs; - t->event_info.next_timeout = e->next_timeout; -} - -static inline void -sl_thd_event_dequeue(struct sl_thd *t, struct cos_thd_event *e) -{ - ps_list_rem(t, SL_THD_EVENT_LIST); - - e->blocked = t->event_info.blocked; - e->elapsed_cycs = t->event_info.elapsed_cycs; - e->next_timeout = t->event_info.next_timeout; - sl_thd_event_info_reset(t); -} - void sl_thd_exit() { @@ -674,6 +643,68 @@ sl_init(microsec_t period) sl_init_corebmp(period, corebmp); } +static inline int +__sl_sched_events_present(void) +{ + struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_sched_ring *ring = &scb->sched_events; + + return __cos_sched_events_present(ring); +} + +static inline int +__sl_sched_event_consume(struct cos_sched_event *e) +{ + struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_sched_ring *ring = &scb->sched_events; + + return __cos_sched_event_consume(ring, e); +} + +static inline int +__sl_sched_rcv(rcv_flags_t rf, struct cos_sched_event *e) +{ + struct sl_global_core *g = sl__globals_core(); + struct sl_thd *curr = sl_thd_curr(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr); + int ret = 0; + + assert(curr == g->sched_thd); + if (!cd) return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); + + rf |= RCV_ULSCHED_RCV; + + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $1f, (%%eax)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "movl $2f, %%ecx\n\t" \ + "movl %%edx, %%eax\n\t" \ + "inc %%eax\n\t" \ + "shl $16, %%eax\n\t" \ + "movl $0, %%edx\n\t" \ + "movl $0, %%edi\n\t" \ + "sysenter\n\t" \ + "jmp 2f\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $1, %%eax\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "popl %%ebp\n\t" \ + : "=a" (ret) + : "a" (cd), "b" (rf), "S" (g->timeout_next), "d" (g->sched_rcv) + : "memory", "cc", "ecx", "edi"); + +// if (cos_thdid() == 7) PRINTC("%s:%d %d\n", __func__, __LINE__, ret); + cd = sl_thd_dcbinfo(sl_thd_curr()); + cd->sp = 0; + + rf |= RCV_ULONLY; + return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); +} + static void sl_sched_loop_intern(int 
non_block) { @@ -696,7 +727,8 @@ sl_sched_loop_intern(int non_block) * states of it's child threads) and normal notifications (mainly activations from * it's parent scheduler). */ - pending = cos_ul_sched_rcv(g->sched_rcv, rfl, g->timeout_next, &e); + //pending = cos_ul_sched_rcv(g->sched_rcv, rfl, g->timeout_next, &e); + pending = __sl_sched_rcv(rfl, &e); if (pending < 0 || !e.tid) goto pending_events; @@ -817,7 +849,7 @@ sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now) /* tcap_transfer will assign sched_tcap's prio to t's tcap if t->prio == 0, which we don't want. */ assert(t->prio >= TCAP_PRIO_MAX && t->prio <= TCAP_PRIO_MIN); - ret = cos_tcap_transfer(sl_thd_rcvcap(t), globals->sched_tcap, transfer, t->prio); + ret = cos_tcap_transfer(sl_thd_rcvcap(t), sl__globals_core()->sched_tcap, transfer, t->prio); } if (likely(ret == 0)) t->last_replenish = replenish; diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index c280a4cfda..30e525410a 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -539,7 +539,9 @@ notify_parent(struct thread *rcv_thd, int send) { struct thread *curr_notif = NULL, *prev_notif = NULL, *arcv_notif = NULL; int depth = 0; + cycles_t now; + rdtscll(now); /* hierarchical notifications - upto init (bounded by ARCV_NOTIF_DEPTH) */ prev_notif = rcv_thd; curr_notif = arcv_notif = arcv_thd_notif(prev_notif); @@ -547,6 +549,7 @@ notify_parent(struct thread *rcv_thd, int send) while (curr_notif && curr_notif != prev_notif) { assert(depth < ARCV_NOTIF_DEPTH); + prev_notif->event_epoch = now; thd_rcvcap_evt_enqueue(curr_notif, prev_notif); if (!(curr_notif->state & THD_STATE_RCVING)) break; diff --git a/src/kernel/include/shared/cos_sched.h b/src/kernel/include/shared/cos_sched.h index eef5664464..bf6b7ef6d1 100644 --- a/src/kernel/include/shared/cos_sched.h +++ b/src/kernel/include/shared/cos_sched.h @@ -4,9 +4,10 @@ #include "./cos_types.h" struct cos_thd_event { - u16_t blocked; - u32_t next_timeout; - u64_t elapsed_cycs; + u16_t blocked; + u32_t next_timeout; + u64_t elapsed_cycs; + u64_t epoch; } __attribute__((packed)); struct cos_sched_event { @@ -25,7 +26,7 @@ struct cos_scb_info { capid_t curr_thd; cycles_t timer_next; sched_tok_t sched_tok; - struct cos_sched_ring sched_events; + struct cos_sched_ring sched_events; /* kernel-level events only */ } CACHE_ALIGNED; struct cos_dcb_info { diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index afb92c1edb..e67708b7bf 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -73,6 +73,7 @@ typedef enum { typedef enum { RCV_NON_BLOCKING = 1, RCV_ULONLY = (1 << 1), + RCV_ULSCHED_RCV = (1 << 2), } rcv_flags_t; #define BOOT_LIVENESS_ID_BASE 2 diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index e507033db8..6c3396aa7d 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -75,6 +75,7 @@ struct thread { struct rcvcap_info rcvcap; struct list event_head; /* all events for *this* end-point */ struct list_node event_list; /* the list of events for another end-point */ + u64_t event_epoch; /* used by user-level for ULSCHED events.. 
*/ } CACHE_ALIGNED; /* @@ -264,7 +265,7 @@ thd_rcvcap_pending_reset(struct thread *arcvt) } static inline int -thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long *cycles, unsigned long *timeout) +thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long *cycles, unsigned long *timeout, u64_t *epoch) { struct thread *e = thd_rcvcap_evt_dequeue(t); @@ -276,6 +277,8 @@ thd_state_evt_deliver(struct thread *t, unsigned long *thd_state, unsigned long e->exec = 0; *timeout = e->timeout; e->timeout = 0; + *epoch = e->event_epoch; + e->event_epoch = 0; return 1; } @@ -607,7 +610,7 @@ thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info if (unlikely(inv_top != 0 || thd->rcvcap.is_init == 0)) return 0; c = thd_invstk_peek_compinfo(thd, cos_info, inv_top); - if (unlikely(!c || !c->scb_data)) return 0; + if (unlikely(!c || !c->scb_data)) return -ENOENT; scb = ((c->scb_data) + get_cpuid()); r = &(scb->sched_events); @@ -623,7 +626,7 @@ thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info unsigned long thd_state; if (!thd_state_evt_deliver(thd, &thd_state, (unsigned long *)&(e->evt.elapsed_cycs), - (unsigned long *)&(e->evt.next_timeout))) break; + (unsigned long *)&(e->evt.next_timeout), &(e->evt.epoch))) break; e->tid = (thd_state << 1) >> 1; e->evt.blocked = (thd_state >> 31); @@ -639,10 +642,13 @@ static inline void thd_rcvcap_pending_deliver(struct thread *thd, struct pt_regs *regs) { unsigned long thd_state = 0, cycles = 0, timeout = 0; + u64_t epoch = 0; - thd_state_evt_deliver(thd, &thd_state, &cycles, &timeout); + /* events only in scb now, no return values... */ thd_rcvcap_pending_reset(thd); - thd_sched_events_produce(thd, cos_cpu_local_info()); + if (thd_sched_events_produce(thd, cos_cpu_local_info()) == -ENOENT) { + thd_state_evt_deliver(thd, &thd_state, &cycles, &timeout, &epoch); + } __userregs_setretvals(regs, thd_rcvcap_pending(thd), thd_state, cycles, timeout); } diff --git a/src/platform/i386/runscripts/unit_slite01.sh b/src/platform/i386/runscripts/unit_slite01.sh new file mode 100644 index 0000000000..511e793c8e --- /dev/null +++ b/src/platform/i386/runscripts/unit_slite01.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp root_fprr.o boot.o +#cp unit_slrcvtest.o boot.o +cp test_boot.o dummy1.o +./cos_linker "llboot.o, ;*unit_slrcvtest.o, ;capmgr.o, ;dummy1.o, ;*boot.o, :boot.o-capmgr.o;unit_slrcvtest.o-boot.o|capmgr.o" ./gen_client_stub From 1268b9bd539e75f5607958758a415dfecedbcb25 Mon Sep 17 00:00:00 2001 From: Phani Date: Mon, 16 Sep 2019 14:13:24 -0400 Subject: [PATCH 110/127] work : work generator using spinning --- .../implementation/capmgr/naive/Makefile | 2 +- .../implementation/capmgr/naive/init.c | 5 +- .../unit_slrcv => capmgr/naive}/spinlib.c | 64 ++++++++----------- .../unit_slrcv => capmgr/naive}/spinlib.h | 0 .../implementation/capmgr/naive/work.c | 38 +++++++++++ .../implementation/tests/unit_slrcv/Makefile | 2 +- .../implementation/tests/unit_slrcv/init.c | 7 +- src/components/interface/work/Makefile | 4 ++ src/components/interface/work/stubs/c_stub.c | 37 +++++++++++ src/components/interface/work/stubs/s_stub.S | 12 ++++ src/components/interface/work/work.h | 12 ++++ 11 files changed, 138 insertions(+), 45 deletions(-) rename src/components/implementation/{tests/unit_slrcv => capmgr/naive}/spinlib.c (66%) rename src/components/implementation/{tests/unit_slrcv => capmgr/naive}/spinlib.h (100%) create mode 100644 
src/components/implementation/capmgr/naive/work.c create mode 100644 src/components/interface/work/Makefile create mode 100644 src/components/interface/work/stubs/c_stub.c create mode 100644 src/components/interface/work/stubs/s_stub.S create mode 100644 src/components/interface/work/work.h diff --git a/src/components/implementation/capmgr/naive/Makefile b/src/components/implementation/capmgr/naive/Makefile index 171178b7c5..4a6a2129f4 100644 --- a/src/components/implementation/capmgr/naive/Makefile +++ b/src/components/implementation/capmgr/naive/Makefile @@ -1,7 +1,7 @@ C_OBJS=cap_mgr.c mem_mgr.c init.c ASM_OBJS= COMPONENT=capmgr.o -INTERFACES=capmgr channel +INTERFACES=capmgr channel work DEPENDENCIES= IF_LIB= ADDITIONAL_LIBS=$(LIBSLRAW) -lsl_mod_fprr -lsl_thd_static_backend diff --git a/src/components/implementation/capmgr/naive/init.c b/src/components/implementation/capmgr/naive/init.c index b150101e29..0512aab8f3 100644 --- a/src/components/implementation/capmgr/naive/init.c +++ b/src/components/implementation/capmgr/naive/init.c @@ -13,6 +13,7 @@ #include #include #include +#include "spinlib.h" static volatile int capmgr_init_core_done = 0; @@ -173,8 +174,9 @@ cos_init(void) spdid_t child; comp_flag_t ch_flags; int ret = 0, i; + unsigned int cycs_per_us = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - PRINTLOG(PRINT_DEBUG, "CPU cycles per sec: %u\n", cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE)); + PRINTLOG(PRINT_DEBUG, "CPU cycles per sec: %u\n", cycs_per_us); ret = hypercall_comp_frontier_get(cos_spd_id(), &heap_frontier, &cap_frontier); assert(ret == 0); @@ -186,6 +188,7 @@ cos_init(void) cap_info_init(); cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); + spinlib_calib(cycs_per_us); capmgr_comp_info_iter(); } else { while (!capmgr_init_core_done) ; /* WAIT FOR INIT CORE TO BE DONE */ diff --git a/src/components/implementation/tests/unit_slrcv/spinlib.c b/src/components/implementation/capmgr/naive/spinlib.c similarity index 66% rename from src/components/implementation/tests/unit_slrcv/spinlib.c rename to src/components/implementation/capmgr/naive/spinlib.c index e17341b5fe..22ff1218b3 100644 --- a/src/components/implementation/tests/unit_slrcv/spinlib.c +++ b/src/components/implementation/capmgr/naive/spinlib.c @@ -76,47 +76,35 @@ spinlib_calib(unsigned int cycs_per_us) void spinlib_cycles(cycles_t cycs) { -// unsigned int i = 0; -// unsigned int iters = cycs / spinlib_cycs_per_spin_iters; -// unsigned int left = cycs % spinlib_cycs_per_spin_iters; -// -// assert(cycs >= spinlib_cycs_per_spin_iters); -// -// /* round off to next cycs/spin */ -// if (left >= (spinlib_cycs_per_spin_iters / 2)) iters ++; -// -// while (i < iters) { -// spinlib_std_iters(); -// i ++; -// } - unsigned long long st, en; - - rdtscll(st); - en = st + cycs; - - // doesn't work with concurrency.. but don't care for now. 
- do { - rdtscll(st); - } while (st < en); + unsigned int i = 0; + unsigned int iters = cycs / spinlib_cycs_per_spin_iters; + unsigned int left = cycs % spinlib_cycs_per_spin_iters; + + assert(cycs >= spinlib_cycs_per_spin_iters); + + /* round off to next cycs/spin */ + if (left >= (spinlib_cycs_per_spin_iters / 2)) iters ++; + + while (i < iters) { + spinlib_std_iters(); + i ++; + } } void spinlib_usecs(cycles_t usecs) { - unsigned long long cycs = sl_usec2cyc(usecs); - - spinlib_cycles(cycs); -// unsigned int i = 0; -// unsigned int iters = usecs / spinlib_usecs_per_spin_iters; -// unsigned int left = usecs % spinlib_usecs_per_spin_iters; -// -// assert(usecs >= spinlib_usecs_per_spin_iters); -// -// /* round off to next usec */ -// if (left >= (spinlib_usecs_per_spin_iters / 2)) iters ++; -// -// while (i < iters) { -// spinlib_std_iters(); -// i ++; -// } + unsigned int i = 0; + unsigned int iters = usecs / spinlib_usecs_per_spin_iters; + unsigned int left = usecs % spinlib_usecs_per_spin_iters; + + assert(usecs >= spinlib_usecs_per_spin_iters); + + /* round off to next usec */ + if (left >= (spinlib_usecs_per_spin_iters / 2)) iters ++; + + while (i < iters) { + spinlib_std_iters(); + i ++; + } } diff --git a/src/components/implementation/tests/unit_slrcv/spinlib.h b/src/components/implementation/capmgr/naive/spinlib.h similarity index 100% rename from src/components/implementation/tests/unit_slrcv/spinlib.h rename to src/components/implementation/capmgr/naive/spinlib.h diff --git a/src/components/implementation/capmgr/naive/work.c b/src/components/implementation/capmgr/naive/work.c new file mode 100644 index 0000000000..ffd63ca16a --- /dev/null +++ b/src/components/implementation/capmgr/naive/work.c @@ -0,0 +1,38 @@ +#include +#include +#include "spinlib.h" + +int +work_cycs_cserialized(unsigned long *hielapsed, unsigned long *loelapsed, unsigned long hi_cycs, unsigned long lo_cycs) +{ + cycles_t st, end, elapsed, cycs_input = (((cycles_t)hi_cycs << 32) | (cycles_t)lo_cycs); + + rdtscll(st); + spinlib_cycles(cycs_input); + rdtscll(end); + elapsed = end - st; + + *hielapsed = (elapsed >> 32); + *loelapsed = ((elapsed << 32) >> 32); + + return 0; +} + +int +work_usecs_cserialized(unsigned long *hielapsed, unsigned long *loelapsed, unsigned long hi_us, unsigned long lo_us) +{ + cycles_t st, end; + microsec_t elapsed, usecs_input = (((microsec_t)hi_us << 32) | (microsec_t)lo_us); + + rdtscll(st); + spinlib_usecs(usecs_input); + rdtscll(end); + /* perhaps use spinlib to return the elapsed or use sl.. 
*/ + elapsed = sl_cyc2usec(end - st); + + *hielapsed = (elapsed >> 32); + *loelapsed = ((elapsed << 32) >> 32); + + return 0; + +} diff --git a/src/components/implementation/tests/unit_slrcv/Makefile b/src/components/implementation/tests/unit_slrcv/Makefile index 11d3890dde..c72ea0a131 100644 --- a/src/components/implementation/tests/unit_slrcv/Makefile +++ b/src/components/implementation/tests/unit_slrcv/Makefile @@ -1,6 +1,6 @@ COMPONENT=unit_slrcvtest.o INTERFACES= -DEPENDENCIES=capmgr schedinit +DEPENDENCIES=capmgr schedinit work IF_LIB= ADDITIONAL_LIBS=-lcobj_format $(LIBSLCAPMGR) -lsl_mod_rr -lsl_thd_static_backend -lcos_dcb diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c index badd1165a4..2aedb5dd08 100644 --- a/src/components/implementation/tests/unit_slrcv/init.c +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -5,7 +5,7 @@ #include #include #include -#include "spinlib.h" +#include static struct sl_xcore_thd *ping; static struct sl_xcore_thd *pong; @@ -22,7 +22,7 @@ ping_fn(void *d) int r = cos_asnd(s, 0); assert(r == 0); - spinlib_usecs(WORK_US); + work_usecs(WORK_US); } sl_thd_exit(); } @@ -51,12 +51,12 @@ cos_init(void *d) unsigned int cycs_per_us = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); if (NUM_CPU == 2) { + assert(0); // need to rework.. if (cos_cpuid() == 0) { cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_llinit(); cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); - spinlib_calib(cycs_per_us); struct sl_thd *t = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); assert(t); @@ -84,7 +84,6 @@ cos_init(void *d) cos_defcompinfo_init(); //cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US); - spinlib_calib(cycs_per_us); struct sl_thd *rt = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); assert(rt); diff --git a/src/components/interface/work/Makefile b/src/components/interface/work/Makefile new file mode 100644 index 0000000000..800adb919e --- /dev/null +++ b/src/components/interface/work/Makefile @@ -0,0 +1,4 @@ +B_OBJS= +LIBS=$(LIB_OBJS:%.o=%.a) + +include ../Makefile.subdir diff --git a/src/components/interface/work/stubs/c_stub.c b/src/components/interface/work/stubs/c_stub.c new file mode 100644 index 0000000000..aafec59e63 --- /dev/null +++ b/src/components/interface/work/stubs/c_stub.c @@ -0,0 +1,37 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2018, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include + +int work_cycs_cserialized(unsigned long *hielpased, unsigned long *loelapsed, unsigned long hi_cycs, unsigned long lo_cycs); +int work_usecs_cserialized(unsigned long *hielpased, unsigned long *loelapsed, unsigned long hi_usecs, unsigned long lo_usecs); + +cycles_t +work_cycs(cycles_t ncycs) +{ + unsigned long hi_in, lo_in, hi_out, lo_out; + + hi_in = (ncycs >> 32); + lo_in = ((ncycs << 32) >> 32); + + work_cycs_cserialized(&hi_out, &lo_out, hi_in, lo_in); + + return (((cycles_t) hi_out << 32) | (cycles_t)lo_out); +} + +microsec_t +work_usecs(microsec_t nusecs) +{ + unsigned long hi_in, lo_in, hi_out, lo_out; + + hi_in = (nusecs >> 32); + lo_in = ((nusecs << 32) >> 32); + + work_usecs_cserialized(&hi_out, &lo_out, hi_in, lo_in); + + return (((microsec_t) hi_out << 32) | (microsec_t)lo_out); +} diff --git a/src/components/interface/work/stubs/s_stub.S b/src/components/interface/work/stubs/s_stub.S new file mode 100644 index 0000000000..d3245b4e75 --- /dev/null +++ b/src/components/interface/work/stubs/s_stub.S @@ -0,0 +1,12 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. + * + * Copyright 2018, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include + +.text +cos_asm_server_stub_rets(work_cycs_cserialized) +cos_asm_server_stub_rets(work_usecs_cserialized) diff --git a/src/components/interface/work/work.h b/src/components/interface/work/work.h new file mode 100644 index 0000000000..9768993ceb --- /dev/null +++ b/src/components/interface/work/work.h @@ -0,0 +1,12 @@ +#ifndef WORK_H +#define WORK_H + +#include + +/* @return: number of actual cycles elapsed */ +cycles_t work_cycs(cycles_t ncycs); +/* @return: number of actual usecs elapsed */ +microsec_t work_usecs(microsec_t nusecs); + + +#endif /* WORK_H */ From dcd1d5e0432beff9a3953d7e1be4fd89829ed9cf Mon Sep 17 00:00:00 2001 From: Phani Date: Mon, 16 Sep 2019 14:43:04 -0400 Subject: [PATCH 111/127] hpet attach api in cos_kernel_api --- .../tests/micro_booter/mb_tests.c | 48 ++++++++++++- src/components/include/cos_kernel_api.h | 1 + src/components/lib/cos_kernel_api.c | 8 +++ src/kernel/capinv.c | 11 ++- src/kernel/include/chal.h | 3 + src/platform/i386/hpet.c | 68 ++++++++++++++++++- 6 files changed, 132 insertions(+), 7 deletions(-) diff --git a/src/components/implementation/tests/micro_booter/mb_tests.c b/src/components/implementation/tests/micro_booter/mb_tests.c index 0277a2cfbf..388dab77b8 100644 --- a/src/components/implementation/tests/micro_booter/mb_tests.c +++ b/src/components/implementation/tests/micro_booter/mb_tests.c @@ -368,6 +368,50 @@ spinner(void *d) ; } +#define TEST_USEC_INTERVAL 1000 /* in microseconds */ +#define TEST_HPET_ITERS 1000 +cycles_t iat_vals[TEST_HPET_ITERS - 1]; + +static void +test_hpet_timer(void) +{ + int i; + thdcap_t tc; + cycles_t c = 0, p = 0, t = 0; + + PRINTC("Starting HPET timer test.\n"); + tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL, 0, 0); + cos_hw_periodic_attach(BOOT_CAPTBL_SELF_INITHW_BASE, HW_HPET_PERIODIC, BOOT_CAPTBL_SELF_INITRCV_BASE, TEST_USEC_INTERVAL); + + + for (i = 0 ; i <= TEST_HPET_ITERS ; i++) { + thdid_t tid; + int blocked; + cycles_t cycles; + + cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_BASE, 0, TCAP_TIME_NIL, 0, cos_sched_sync()); + p = c; + rdtscll(c); + if (i > 0) { + t += c-p; + iat_vals[i - 1] = c - p; + } + + //while 
(cos_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_BASE, &tid, &blocked, &cycles) != 0) ; + } + + cos_hw_detach(BOOT_CAPTBL_SELF_INITHW_BASE, HW_HPET_PERIODIC); + + for (i = 0 ; i < TEST_HPET_ITERS ; i += 10) { + PRINTC("%llu ", iat_vals[i]); + } + + PRINTC("\nAverage inter-arrival time (%d microseconds) = %lld\n", + TEST_USEC_INTERVAL, t/TEST_HPET_ITERS); + + PRINTC("Timer test completed.\nSuccess.\n"); +} + static void test_timer(void) { @@ -966,10 +1010,10 @@ test_run_mb(void) // test_async_endpoints_perf(); // // test_inv(); - test_inv_perf(); +// test_inv_perf(); // // test_captbl_expand(); - + test_hpet_timer(); /* * FIXME: Preemption stack mechanism in the kernel is disabled. * test_wakeup(); diff --git a/src/components/include/cos_kernel_api.h b/src/components/include/cos_kernel_api.h index fac998ae47..542290774d 100644 --- a/src/components/include/cos_kernel_api.h +++ b/src/components/include/cos_kernel_api.h @@ -203,6 +203,7 @@ int cos_tcap_merge(tcap_t dst, tcap_t rm); /* Hardware (interrupts) operations */ hwcap_t cos_hw_alloc(struct cos_compinfo *ci, u32_t bitmap); int cos_hw_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t rcvcap); +int cos_hw_periodic_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t rcvcap, unsigned int period); int cos_hw_detach(hwcap_t hwc, hwid_t hwid); void *cos_hw_map(struct cos_compinfo *ci, hwcap_t hwc, paddr_t pa, unsigned int len); int cos_hw_cycles_per_usec(hwcap_t hwc); diff --git a/src/components/lib/cos_kernel_api.c b/src/components/lib/cos_kernel_api.c index cbd7f01dfd..8b0e8d4cd5 100644 --- a/src/components/lib/cos_kernel_api.c +++ b/src/components/lib/cos_kernel_api.c @@ -1110,6 +1110,14 @@ cos_hw_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t arcv) return call_cap_op(hwc, CAPTBL_OP_HW_ATTACH, hwid, arcv, 0, 0); } +int +cos_hw_periodic_attach(hwcap_t hwc, hwid_t hwid, arcvcap_t arcv, unsigned int period) +{ + assert(hwid == HW_HPET_PERIODIC); + + return call_cap_op(hwc, CAPTBL_OP_HW_ATTACH, hwid, arcv, period, 0); +} + int cos_hw_detach(hwcap_t hwc, hwid_t hwid) { diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 30e525410a..c223c38bc9 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -1820,12 +1820,16 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * struct cap_arcv *rcvc; hwid_t hwid = __userregs_get1(regs); capid_t rcvcap = __userregs_get2(regs); + u32_t period = __userregs_get3(regs); rcvc = (struct cap_arcv *)captbl_lkup(ci->captbl, rcvcap); if (!CAP_TYPECHK(rcvc, CAP_ARCV)) cos_throw(err, -EINVAL); ret = hw_attach_rcvcap((struct cap_hw *)ch, hwid, rcvc, rcvcap); - if (!ret) ret = chal_irq_enable(hwid, get_cpuid()); + if (!ret) { + if (hwid == HW_HPET_PERIODIC || hwid == HW_HPET_ONESHOT) chal_hpet_periodic_set(hwid, period); + ret = chal_irq_enable(hwid, get_cpuid()); + } break; } @@ -1833,7 +1837,10 @@ static int __attribute__((noinline)) composite_syscall_slowpath(struct pt_regs * hwid_t hwid = __userregs_get1(regs); ret = hw_detach_rcvcap((struct cap_hw *)ch, hwid); - if (!ret) ret = chal_irq_disable(hwid, get_cpuid()); + if (!ret) { + if (hwid == HW_HPET_PERIODIC || hwid == HW_HPET_ONESHOT) chal_hpet_disable(hwid); + ret = chal_irq_disable(hwid, get_cpuid()); + } break; } diff --git a/src/kernel/include/chal.h b/src/kernel/include/chal.h index 2b68fe2837..b7a4683587 100644 --- a/src/kernel/include/chal.h +++ b/src/kernel/include/chal.h @@ -94,6 +94,9 @@ void chal_send_ipi(int cpu_id); void chal_idle(void); void chal_timer_set(cycles_t cycles); void chal_timer_disable(void); +void 
chal_hpet_periodic_set(hwid_t, unsigned long); +void chal_hpet_disable(hwid_t); +cycles_t chal_hpet_first_period(void); int chal_irq_disable(int irqline, cpuid_t cpu_id); int chal_irq_enable(int irqline, cpuid_t cpu_id); diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index 15304bfede..c6d0c4bacd 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -99,6 +99,8 @@ static int hpet_calibration_init = 0; static unsigned long hpet_cpucyc_per_hpetcyc = HPET_ERROR_BOUND_FACTOR; static unsigned long hpet_cpucyc_per_tick; static unsigned long hpet_hpetcyc_per_tick; +static unsigned long hpet_periodicity_curr[2] = { 0 }; +static cycles_t hpet_first_hpet_period = 0; /* for timer 0 = HPET_PERIODIC */ extern u32_t chal_msr_mhz; static inline u64_t @@ -174,6 +176,7 @@ hpet_calibration(void) hpet_disable(HPET_PERIODIC); hpet_disable(HPET_PERIODIC); + chal_irq_disable(HW_HPET_PERIODIC, 0); } cnt++; } @@ -181,9 +184,11 @@ hpet_calibration(void) int chal_cyc_usec(void) { - if (lapic_timer_calib_init) return 0; + if (unlikely(lapic_timer_calib_init || hpet_calibration_init)) return 0; - return hpet_cpucyc_per_tick / HPET_DEFAULT_PERIOD_US; + if (likely(hpet_cpucyc_per_tick)) return hpet_cpucyc_per_tick / HPET_DEFAULT_PERIOD_US; + + return 0; } int @@ -191,9 +196,10 @@ hpet_periodic_handler(struct pt_regs *regs) { int preempt = 1; + lapic_ack(); if (unlikely(hpet_calibration_init)) hpet_calibration(); + if (unlikely(hpet_periodicity_curr[HPET_PERIODIC] && !hpet_first_hpet_period)) rdtscll(hpet_first_hpet_period); - lapic_ack(); preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][HW_HPET_PERIODIC], regs); HPET_INT_ENABLE(HPET_PERIODIC); @@ -271,6 +277,53 @@ hpet_find(void *timer) return 0; } +void +chal_hpet_periodic_set(hwid_t hwid, unsigned long usecs_period) +{ + hpet_type_t type = 0; + + assert(hwid == HW_HPET_PERIODIC); + type = HPET_PERIODIC; + + if (hpet_periodicity_curr[type] != usecs_period) { + hpet_disable(type); + hpet_disable(type); + + hpet_periodicity_curr[type] = 0; + } + + if (hpet_periodicity_curr[type] == 0) { + unsigned long tick_multiple = 0; + cycles_t hpetcyc_per_period = 0; + + assert(hpet_calibration_init == 0); + assert((usecs_period >= HPET_DEFAULT_PERIOD_US) && (usecs_period % HPET_DEFAULT_PERIOD_US == 0)); + + tick_multiple = usecs_period / HPET_DEFAULT_PERIOD_US; + hpetcyc_per_period = (cycles_t)hpet_hpetcyc_per_tick * (cycles_t)tick_multiple; + hpet_periodicity_curr[type] = usecs_period; + if (type == HPET_PERIODIC) hpet_first_hpet_period = 0; + hpet_set(type, hpetcyc_per_period); + printk("Setting HPET [%u:%u] Periodicity:%lu hpetcyc_per_period:%llu\n", hwid, type, usecs_period, hpetcyc_per_period); + } +} + +cycles_t +chal_hpet_first_period(void) +{ + return hpet_first_hpet_period; +} + +void +chal_hpet_disable(hwid_t hwid) +{ + printk("Disabling HPET %u\n", hwid); + hpet_type_t type = (hwid == HW_HPET_PERIODIC ? HPET_PERIODIC : HPET_ONESHOT); + + hpet_disable(type); + hpet_disable(type); +} + void hpet_set_page(u32_t page) { @@ -295,6 +348,15 @@ hpet_init(void) printk("Enabling timer @ %p with tick granularity %ld picoseconds\n", hpet, pico_per_hpetcyc); + /* + * FIXME: For some reason, setting to non-legacy mode isn't working well. + * Periodicity of the HPET fired is wrong and any interval configuration + * is still producing the same wrong interval timing. + * + * So, Enable legacy interrupt routing like we had before! + */ + *hpet_config |= HPET_LEG_RT_CNF; + /* * Set the timer as specified. 
This assumes that the cycle * specification is in hpet cycles (not cpu cycles). From 94e3a5415eaf8f76cae14f4bf48f62cabf046ca6 Mon Sep 17 00:00:00 2001 From: Phani Date: Thu, 19 Sep 2019 18:03:36 -0400 Subject: [PATCH 112/127] debugging userlevel rcv and sched --- .../implementation/capmgr/naive/cap_mgr.c | 44 ++++ .../implementation/capmgr/naive/spinlib.c | 2 +- .../no_interface/llbooter/boot_deps.h | 22 +- .../implementation/sched/root_fprr/init.c | 25 +- .../tests/micro_booter/mb_tests.c | 2 +- .../implementation/tests/spin_comp/Makefile | 10 + .../implementation/tests/spin_comp/init.c | 17 ++ .../tests/unit_schedtests/unit_schedlib.c | 2 +- .../implementation/tests/unit_slrcv/Makefile | 2 +- .../implementation/tests/unit_slrcv/init.c | 89 ++++++- src/components/include/sl.h | 243 ++++++++++++------ src/components/interface/capmgr/capmgr.h | 4 + .../interface/capmgr/stubs/s_stub.S | 4 + src/components/lib/sl/sl_sched.c | 86 ++++--- src/kernel/capinv.c | 69 +++-- src/kernel/include/thd.h | 16 +- src/platform/i386/hpet.c | 24 +- src/platform/i386/qemu-kvm.sh | 2 +- src/platform/i386/runscripts/unit_slite01.sh | 5 +- 19 files changed, 495 insertions(+), 173 deletions(-) create mode 100644 src/components/implementation/tests/spin_comp/Makefile create mode 100644 src/components/implementation/tests/spin_comp/init.c diff --git a/src/components/implementation/capmgr/naive/cap_mgr.c b/src/components/implementation/capmgr/naive/cap_mgr.c index 1bd6bb61aa..3886ac4919 100644 --- a/src/components/implementation/capmgr/naive/cap_mgr.c +++ b/src/components/implementation/capmgr/naive/cap_mgr.c @@ -563,3 +563,47 @@ capmgr_asnd_key_create(cos_channelkey_t key) return (asndcap_t)capret; } + +int +capmgr_hw_attach(hwid_t hwid, thdid_t tid) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct sl_thd *ti = cap_info_thd_find(rc, tid); + + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + if (!ti || !sl_thd_rcvcap(ti)) return -EINVAL; + + return cos_hw_attach(BOOT_CAPTBL_SELF_INITHW_BASE, hwid, sl_thd_rcvcap(ti)); +} + +int +capmgr_hw_periodic_attach(hwid_t hwid, thdid_t tid, unsigned int period_us) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + struct sl_thd *ti = cap_info_thd_find(rc, tid); + + if (period_us == 0) return -EINVAL; + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + if (!ti || !sl_thd_rcvcap(ti)) return -EINVAL; + + return cos_hw_periodic_attach(BOOT_CAPTBL_SELF_INITHW_BASE, hwid, sl_thd_rcvcap(ti), period_us); +} + +int +capmgr_hw_detach(hwid_t hwid) +{ + spdid_t cur = cos_inv_token(); + struct cos_defcompinfo *cap_dci = cos_defcompinfo_curr_get(); + struct cos_compinfo *cap_ci = cos_compinfo_get(cap_dci); + struct cap_comp_info *rc = cap_info_comp_find(cur); + + if (!rc || !cap_info_init_check(rc)) return -EINVAL; + + return cos_hw_detach(BOOT_CAPTBL_SELF_INITHW_BASE, hwid); +} diff --git a/src/components/implementation/capmgr/naive/spinlib.c b/src/components/implementation/capmgr/naive/spinlib.c index 22ff1218b3..782cdc3c6f 100644 --- a/src/components/implementation/capmgr/naive/spinlib.c +++ b/src/components/implementation/capmgr/naive/spinlib.c @@ -29,7 +29,7 @@ spinlib_calib_test(void) rdtscll(end); elapsed_cycs = end - st; - PRINTC("SPIN %lluus 
=> elapsed :%lluus %llucycs\n", test_us[i], elapsed_cycs, sl_cyc2usec(elapsed_cycs)); + PRINTC("SPIN %lluus => elapsed :%llucycs %lluus\n", test_us[i], elapsed_cycs, sl_cyc2usec(elapsed_cycs)); } } diff --git a/src/components/implementation/no_interface/llbooter/boot_deps.h b/src/components/implementation/no_interface/llbooter/boot_deps.h index 59e11fadf8..70a1654f53 100644 --- a/src/components/implementation/no_interface/llbooter/boot_deps.h +++ b/src/components/implementation/no_interface/llbooter/boot_deps.h @@ -435,6 +435,21 @@ boot_bootcomp_init(void) bootsi->flags |= COMP_FLAG_SCHED; } +static void +boot_root_sched_transfer(void) +{ + struct cos_aep_info *root_aep = NULL; + int ret; + + if (!root_spdid[cos_cpuid()]) return; + + root_aep = boot_spd_initaep_get(root_spdid[cos_cpuid()]); + + PRINTLOG(PRINT_DEBUG, "Root scheduler is %u, transferring INF budget now!\n", root_spdid[cos_cpuid()]); + ret = cos_tcap_transfer(root_aep->rcv, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, LLBOOT_ROOTSCHED_PRIO); + assert(ret == 0); +} + static void boot_done(void) { @@ -448,7 +463,6 @@ void boot_root_sched_run(void) { struct cos_aep_info *root_aep = NULL; - int ret; if (!root_spdid[cos_cpuid()]) { PRINTLOG(PRINT_WARN, "No root scheduler!\n"); @@ -459,10 +473,7 @@ boot_root_sched_run(void) root_aep = boot_spd_initaep_get(root_spdid[cos_cpuid()]); PRINTLOG(PRINT_DEBUG, "Root scheduler is %u, switching to it now!\n", root_spdid[cos_cpuid()]); - ret = cos_tcap_transfer(root_aep->rcv, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, LLBOOT_ROOTSCHED_PRIO); - assert(ret == 0); - - ret = cos_switch(root_aep->thd, root_aep->tc, LLBOOT_ROOTSCHED_PRIO, TCAP_TIME_NIL, 0, cos_sched_sync()); + cos_switch(root_aep->thd, root_aep->tc, LLBOOT_ROOTSCHED_PRIO, TCAP_TIME_NIL, 0, cos_sched_sync()); PRINTLOG(PRINT_ERROR, "Root scheduler returned.\n"); assert(0); } @@ -609,6 +620,7 @@ boot_root_initaep_set(spdid_t dstid, spdid_t srcid, thdcap_t thd, arcvcap_t rcv, assert(a->tc); a->rcv = cos_cap_cpy(b, c, CAP_ARCV, rcv); assert(a->rcv); + if (root_spdid[cos_cpuid()] == srcid) boot_root_sched_transfer(); done: boot_comp_sched_set(srcid); diff --git a/src/components/implementation/sched/root_fprr/init.c b/src/components/implementation/sched/root_fprr/init.c index 3b2abc7221..abfc035718 100644 --- a/src/components/implementation/sched/root_fprr/init.c +++ b/src/components/implementation/sched/root_fprr/init.c @@ -16,9 +16,9 @@ u32_t cycs_per_usec = 0; #define INITIALIZE_PERIOD_MS (4000) #define INITIALIZE_BUDGET_MS (2000) -#define FIXED_PRIO 2 -#define FIXED_PERIOD_MS (10000) -#define FIXED_BUDGET_MS (4000) +#define FIXED_PRIO 1 +#define FIXED_PERIOD_MS (100000) +#define FIXED_BUDGET_MS (100000) static struct sl_thd *__initializer_thd[NUM_CPU] CACHE_ALIGNED; @@ -46,14 +46,23 @@ void sched_child_init(struct sched_childinfo *schedci) { vaddr_t dcbaddr; + struct sl_thd *initthd; assert(schedci); schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 
1 : 0, 0, 0, 0, &dcbaddr); assert(schedci->initthd); - - sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); - sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); - sl_thd_param_set(schedci->initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); + initthd = schedci->initthd; + + if (schedci->flags & COMP_FLAG_SCHED) { + if (cos_tcap_transfer(sl_thd_rcvcap(initthd), BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, FIXED_PRIO)) { + PRINTC("Failed to transfer INF budget\n"); + assert(0); + } + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); + } + if (schedci->id == 1) sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); + else sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO+1)); } thdid_t @@ -103,7 +112,7 @@ cos_init(void) while (!ps_load((unsigned long *)&init_done[i])) ; } - sl_init_corebmp(SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(100*SL_MIN_PERIOD_US, cpubmp); sched_childinfo_init(); __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); assert(__initializer_thd[cos_cpuid()]); diff --git a/src/components/implementation/tests/micro_booter/mb_tests.c b/src/components/implementation/tests/micro_booter/mb_tests.c index 388dab77b8..1eedd26023 100644 --- a/src/components/implementation/tests/micro_booter/mb_tests.c +++ b/src/components/implementation/tests/micro_booter/mb_tests.c @@ -368,7 +368,7 @@ spinner(void *d) ; } -#define TEST_USEC_INTERVAL 1000 /* in microseconds */ +#define TEST_USEC_INTERVAL 5000 /* in microseconds */ #define TEST_HPET_ITERS 1000 cycles_t iat_vals[TEST_HPET_ITERS - 1]; diff --git a/src/components/implementation/tests/spin_comp/Makefile b/src/components/implementation/tests/spin_comp/Makefile new file mode 100644 index 0000000000..bb7f30634e --- /dev/null +++ b/src/components/implementation/tests/spin_comp/Makefile @@ -0,0 +1,10 @@ +C_OBJS=init.o +ASM_OBJS= +COMPONENT=spin_comp.o +INTERFACES= +DEPENDENCIES=capmgr schedinit +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/spin_comp/init.c b/src/components/implementation/tests/spin_comp/init.c new file mode 100644 index 0000000000..15cdd385f5 --- /dev/null +++ b/src/components/implementation/tests/spin_comp/init.c @@ -0,0 +1,17 @@ +#include +#include +#include +#include +#include + +void +cos_init(void) +{ + PRINTC("Spin Init!\n"); + schedinit_child(); + + while (1) ; + + PRINTLOG(PRINT_ERROR, "Cannot reach here!\n"); + assert(0); +} diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index 19821aa5a8..770493803e 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -37,7 +37,7 @@ #define MAGIC_RET 0xDEADBEEF -#define INV_TEST +#undef INV_TEST static volatile cycles_t mid_cycs = 0; static volatile int testing = 1; diff --git a/src/components/implementation/tests/unit_slrcv/Makefile b/src/components/implementation/tests/unit_slrcv/Makefile index c72ea0a131..3500d01777 100644 --- a/src/components/implementation/tests/unit_slrcv/Makefile +++ b/src/components/implementation/tests/unit_slrcv/Makefile @@ -2,7 +2,7 @@ COMPONENT=unit_slrcvtest.o INTERFACES= DEPENDENCIES=capmgr schedinit 
work IF_LIB= -ADDITIONAL_LIBS=-lcobj_format $(LIBSLCAPMGR) -lsl_mod_rr -lsl_thd_static_backend -lcos_dcb +ADDITIONAL_LIBS=-lcobj_format $(LIBSLCAPMGR) -lsl_mod_fprr -lsl_thd_static_backend -lcos_dcb -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c index 2aedb5dd08..64fe2a3b76 100644 --- a/src/components/implementation/tests/unit_slrcv/init.c +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -2,15 +2,20 @@ #include #include #include +#include #include #include #include #include +#include +#include static struct sl_xcore_thd *ping; static struct sl_xcore_thd *pong; -#define WORK_US (10*1000*1000) +#define HPET_PERIOD_TEST_US 2000 + +#define WORK_US (1000) static inline void ping_fn(void *d) @@ -27,14 +32,72 @@ ping_fn(void *d) sl_thd_exit(); } +unsigned int iter = 0; +cycles_t st = 0, en = 0, tot = 0, wc = 0; +CRT_CHAN_STATIC_ALLOC(c0, int, 4); +CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); + +static inline void +work_fn(void *x) +{ + int rcv; + while (1) { + // printc("a"); + //sl_thd_block(0); + crt_chan_recv_test(c0, &rcv); + // printc("b"); + rdtscll(en); + if (unlikely(!st)) continue; + assert(en > st); + cycles_t diff = en - st; + if (diff > wc) wc = diff; + tot += diff; + iter ++; + if (unlikely(iter == 1000)) { + PRINTC("%llu %llu\n", tot / iter, wc); + iter = 0; + wc = tot = 0; + } + } +} + +struct sl_thd *wt = NULL; +thdid_t wtid = 0; + static inline void pong_fn(arcvcap_t r, void *d) { + //printc("#"); + int a = capmgr_hw_periodic_attach(HW_HPET_PERIODIC, cos_thdid(), HPET_PERIOD_TEST_US); + assert(a == 0); + int snd = 0x1234; + //printc("!"); + + int i = 0; while (1) { + // printc("c"); int p = sl_thd_rcv(RCV_ULONLY); + rdtscll(st); + // printc("d"); //int p = cos_rcv(r, 0); - printc("%d", p); + //printc("[%d] ", i++); + //sl_thd_wakeup(wtid); + crt_chan_send_test(c0, &snd); +// printc("e"); +// rdtscll(en); +// //if (unlikely(!st)) continue; +// assert(en > st); +// cycles_t diff = en - st; +// if (diff > wc) wc = diff; +// tot += diff; +// iter ++; +// if (unlikely(iter == 1000)) { +// PRINTC("%llu %llu\n", tot / iter, wc); +// iter = 0; +// wc = tot = 0; +// } + } sl_thd_exit(); } @@ -83,21 +146,27 @@ cos_init(void *d) cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init(); //cos_dcb_info_init_curr(); - sl_init(SL_MIN_PERIOD_US); + sl_init(SL_MIN_PERIOD_US*100); + + crt_chan_init_test(c0); + wt = sl_thd_alloc(work_fn, NULL); + assert(wt); + wtid = sl_thd_thdid(wt); struct sl_thd *rt = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); assert(rt); + sl_thd_param_set(wt, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); - r = sl_thd_rcvcap(rt); - assert(r); - struct sl_thd *st = sl_thd_alloc(ping_fn, (void *)&s); - assert(st); - sl_thd_param_set(st, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); +// r = sl_thd_rcvcap(rt); +// assert(r); + //struct sl_thd *st = sl_thd_alloc(ping_fn, (void *)&s); + //assert(st); + //sl_thd_param_set(st, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); //s = cos_asnd_alloc(ci, r, ci->captbl_cap); //assert(s); - s = capmgr_asnd_rcv_create(r); - assert(s); +// s = capmgr_asnd_rcv_create(r); +// assert(s); } ps_faa(&init_done[cos_cpuid()], 1); diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 54f3a10a33..a5198f5b1a 100644 --- 
a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -436,7 +436,7 @@ sl_thd_is_runnable(struct sl_thd *t) int sl_thd_kern_dispatch(thdcap_t t); static inline int -sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout) +sl_thd_activate_old(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout) { struct cos_defcompinfo *dci = cos_defcompinfo_curr_get(); struct cos_compinfo *ci = &dci->ci; @@ -459,18 +459,65 @@ sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout) * Force switch to the scheduler with current tcap. */ return cos_switch(g->sched_thdcap, g->sched_tcap, t->prio, - timeout, g->sched_rcv, tok); + timeout, g->sched_rcv, tok); } } static inline int -sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) +sl_thd_dispatch_kern(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr, tcap_time_t timeout, tcap_t tc, tcap_prio_t p) { + /* FIXME: cannot handle prio here for now! */ volatile struct cos_scb_info *scb = sl_scb_info_core(); + struct sl_global_core *g = sl__globals_core(); struct cos_dcb_info *cd = sl_thd_dcbinfo(curr), *nd = sl_thd_dcbinfo(next); + word_t a = ((sl_thd_thdcap(next) + 1) << COS_CAPABILITY_OFFSET) + (tok >> 16); + word_t b = (tc << 16) | g->sched_rcv; + word_t S = (p << 32) >> 32; + word_t D = (((p << 16) >> 48) << 16) | ((tok << 16) >> 16); + word_t d = timeout; + int ret = 0; + //printc("%u %u %u %u %llu %lu\n", sl_thd_thdid(curr), sl_thd_thdid(next), g->sched_rcv, tc, p, timeout); +// if (cos_spd_id() != 4) printc("F"); assert(curr != next); - if (unlikely(!cd || !nd)) return sl_thd_activate(next, tok, sl__globals_core()->timeout_next); + if (unlikely(!cd || !nd)) return cos_switch(sl_thd_thdcap(next), sl_thd_tcap(next), next->prio, timeout, g->sched_rcv, tok); + + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $1f, (%%esi)\n\t" \ + "movl %%esp, 4(%%esi)\n\t" \ + "movl %%ecx, %%esi\n\t" \ + "movl $2f, %%ecx\n\t" \ + "sysenter\n\t" \ + "jmp 2f\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $0, %%eax\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "popl %%ebp\n\t" \ + : "=a" (ret) + : "a" (a), "b" (b), "S" (cd), "D" (D), "d" (d), "c" (S) + : "memory", "cc"); + + scb = sl_scb_info_core(); + cd = sl_thd_dcbinfo(sl_thd_curr()); + cd->sp = 0; + if (unlikely(ps_load(&scb->sched_tok) != tok)) return -EAGAIN; + + return ret; +} + +static inline int +sl_thd_dispatch_usr(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) +{ + volatile struct cos_scb_info *scb = sl_scb_info_core(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr), *nd = sl_thd_dcbinfo(next); + +// if (cos_spd_id() != 4) printc("E"); + assert(curr != next); + if (unlikely(!cd || !nd)) return cos_defswitch(sl_thd_thdcap(next), next->prio, sl__globals_core()->timeout_next, tok); /* * jump labels in the asm routine: @@ -524,11 +571,46 @@ sl_thd_dispatch(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) : "memory", "cc"); scb = sl_scb_info_core(); + assert(sl_thd_dcbinfo(curr)->sp == 0); if (unlikely(ps_load(&scb->sched_tok) != tok)) return -EAGAIN; return 0; } +static inline int +sl_thd_activate_c(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout, tcap_prio_t prio, struct sl_thd *curr, struct sl_global_core *g) +{ + //printc(":%d\n", __LINE__); + if (unlikely(t->properties & SL_THD_PROPERTY_SEND)) { + //printc(":%d\n", __LINE__); + return cos_sched_asnd(t->sndcap, g->timeout_next, g->sched_rcv, tok); + } else if (unlikely(t->properties & SL_THD_PROPERTY_OWN_TCAP)) { + //printc(":%d\n", 
__LINE__); + return sl_thd_dispatch_kern(t, tok, curr, timeout, sl_thd_tcap(t), t->prio); +// return cos_switch(sl_thd_thdcap(t), sl_thd_tcap(t), prio, timeout, g->sched_rcv, tok);//sl_thd_dispatch_kern(t, tok, curr, timeout, g->sched_tcap, prio); + } + + if (unlikely(timeout || prio)) { + //printc(":%d\n", __LINE__); + //return cos_switch(sl_thd_thdcap(t), g->sched_tcap, prio, timeout, g->sched_rcv, tok); + //return sl_thd_dispatch_kern(t, tok, curr, timeout, g->sched_tcap, prio); + return sl_thd_dispatch_usr(t, tok, curr); + } else { + //printc(":%d\n", __LINE__); + return sl_thd_dispatch_usr(t, tok, curr); + } + /* TODO: prio change? */ +} + + +static inline int +sl_thd_activate(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout, tcap_prio_t prio) +{ + struct sl_global_core *g = sl__globals_core(); + + return sl_thd_activate_c(t, tok, timeout, prio, sl_thd_curr(), g); +} + static inline int sl_cs_exit_schedule_nospin_arg_c(struct sl_thd *curr, struct sl_thd *next) { @@ -540,7 +622,10 @@ sl_cs_exit_schedule_nospin_arg_c(struct sl_thd *curr, struct sl_thd *next) #ifdef SL_CS sl_cs_exit(); #endif - return sl_thd_dispatch(next, tok, curr); +// return sl_thd_dispatch(next, tok, curr); + return sl_thd_activate_c(next, tok, 0, 0, curr, sl__globals_core()); + //return sl_thd_activate_old(next, tok); + //return sl_thd_dispatch_usr(next, tok, curr); } void sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now); @@ -571,7 +656,7 @@ void sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now); static inline int sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) { - struct sl_thd *t = to; + struct sl_thd *t = to, *c = sl_thd_curr(); struct sl_global_core *globals = sl__globals_core(); sched_tok_t tok; cycles_t now; @@ -619,17 +704,19 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) #ifdef SL_CS sl_cs_exit(); #endif - if (unlikely(t == sl_thd_curr())) return 0; - - /* - * if the periodic timer is already ahead, - * don't reprogram it! - */ -// if (likely(offset > globals->cyc_per_usec && globals->timer_prev)) { - ret = sl_thd_dispatch(t, tok, sl_thd_curr()); -// } else { -// ret = sl_thd_activate(t, tok, globals->timeout_next); -// } + if (unlikely(t == c)) return 0; + +// /* +// * if the periodic timer is already ahead, +// * don't reprogram it! +// */ +//// if (likely(offset > globals->cyc_per_usec && globals->timer_prev)) { +// ret = sl_thd_dispatch(t, tok, sl_thd_curr()); +//// } else { +//// ret = sl_thd_activate(t, tok, globals->timeout_next); +//// } + ret = sl_thd_activate_c(t, tok, 0, 0, c, globals); + //ret = sl_thd_dispatch_usr(t, tok, c); /* * one observation, in slowpath switch: @@ -654,7 +741,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) * that returns to this thread when it is not runnable. * something!!!! */ - if (unlikely(!sl_thd_is_runnable(sl_thd_curr()))) return -EAGAIN; + if (unlikely(!sl_thd_is_runnable(c))) return -EAGAIN; #ifdef SL_REPLENISH /* @@ -665,13 +752,14 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) * the inter-component delegations), block till next timeout and try again. 
*/ if (unlikely(ret == -EPERM)) { + //printc("h"); assert(t != globals->sched_thd && t != globals->idle_thd); sl_thd_block_expiry(t); - if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok, globals->timeout_next); + if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok, globals->timeout_next, 0); } #endif /* either this thread is runnable at this point or a switch failed */ - assert(sl_thd_is_runnable(sl_thd_curr()) || ret); + assert(sl_thd_is_runnable(c) || ret); return ret; } @@ -679,12 +767,14 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) static inline int sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) { - struct sl_thd *t = to; + struct sl_thd *t = to, *c = sl_thd_curr(); struct sl_global_core *globals = sl__globals_core(); sched_tok_t tok; cycles_t now; s64_t offset; int ret; + struct cos_dcb_info *cb; + tcap_time_t timeout = 0; /* Don't abuse this, it is only to enable the tight loop around this function for races... */ #ifdef SL_CS @@ -713,7 +803,7 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) struct sl_thd_policy *pt = sl_mod_schedule(); if (unlikely(!pt)) - t = globals->idle_thd; + t = globals->sched_thd; else t = sl_mod_thd_get(pt); } @@ -723,10 +813,17 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) #endif assert(t && sl_thd_is_runnable(t)); + if (unlikely(!(offset > globals->cyc_per_usec && globals->timer_prev + && abs_timeout > globals->timer_next))) { + timeout = abs_timeout < globals->timer_next + ? tcap_cyc2time(abs_timeout) : globals->timeout_next; + //printc("X"); + } + #ifdef SL_CS sl_cs_exit(); #endif - if (unlikely(t == sl_thd_curr())) return 0; + if (unlikely(t == c)) return 0; /* * if the requested timeout is greater than next timeout @@ -735,14 +832,17 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) * * else, reprogram for an earlier timeout requested. */ - if (likely(offset > globals->cyc_per_usec && globals->timer_prev - && abs_timeout > globals->timer_next)) { - ret = sl_thd_dispatch(t, tok, sl_thd_curr()); - } else { - ret = sl_thd_activate(t, tok, abs_timeout < globals->timer_next - ? tcap_cyc2time(abs_timeout) : globals->timeout_next); - } - if (unlikely(!sl_thd_is_runnable(sl_thd_curr()))) return -EAGAIN; + assert(sl_thd_dcbinfo(sl_thd_curr())->sp == 0); +// if (likely(offset > globals->cyc_per_usec && globals->timer_prev +// && abs_timeout > globals->timer_next)) { +// ret = sl_thd_dispatch_usr(t, tok, sl_thd_curr()); +// } else { +// ret = sl_thd_activate_old(t, tok, abs_timeout < globals->timer_next +// ? tcap_cyc2time(abs_timeout) : globals->timeout_next); +// } + + ret = sl_thd_activate_c(t, tok, timeout, 0, c, globals); + if (unlikely(!sl_thd_is_runnable(c))) return -EAGAIN; #ifdef SL_REPLENISH /* @@ -753,9 +853,10 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) * the inter-component delegations), block till next timeout and try again. 
*/ if (unlikely(ret == -EPERM)) { + //printc("H"); assert(t != globals->sched_thd && t != globals->idle_thd); sl_thd_block_expiry(t); - if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate(globals->sched_thd, tok, globals->timeout_next); + if (unlikely(sl_thd_curr() != globals->sched_thd)) ret = sl_thd_activate_old(globals->sched_thd, tok, globals->timeout_next); } #endif @@ -907,7 +1008,10 @@ sl_thd_event_enqueue(struct sl_thd *t, struct cos_thd_event *e) { struct sl_global_core *g = sl__globals_core(); - if (e->epoch <= t->event_info.epoch) return; + if (e->epoch <= t->event_info.epoch) { + printc("<%d>", sl_thd_thdid(t)); + return; + } if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); @@ -930,45 +1034,40 @@ sl_thd_event_dequeue(struct sl_thd *t, struct cos_thd_event *e) static inline int sl_thd_rcv(rcv_flags_t flags) { - /* FIXME: elapsed_cycs accounting..?? */ - struct cos_thd_event ev = { .blocked = 1, .next_timeout = 0, .epoch = 0, .elapsed_cycs = 0 }; - struct sl_thd *t = sl_thd_curr(); - unsigned long *p = &sl_thd_dcbinfo(t)->pending, q = 0; - int ret = 0; - - assert(sl_thd_rcvcap(t)); - assert(!(flags & RCV_ULSCHED_RCV)); -check: - sl_cs_enter(); - /* there no pending event in the dcbinfo->pending */ - if ((q = ps_load(p)) == 0) { - if (unlikely(!(flags & RCV_ULONLY))) goto rcv; - if (unlikely(flags & RCV_NON_BLOCKING)) { - ret = -EAGAIN; - goto done; - } - - ev.epoch = sl_now(); - sl_thd_event_enqueue(t, &ev); - /* - * TODO: add event so sched thread will do this? - * sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); - */ - sl_cs_exit_switchto(sl__globals_core()->sched_thd); - - goto check; - } - - /* cas may fail. but we got an event right now! */ - ps_upcas(p, q, 0); -done: - sl_cs_exit(); - - return ret; -rcv: - sl_cs_exit(); - - return cos_rcv(sl_thd_rcvcap(t), flags); + return cos_rcv(sl_thd_rcvcap(sl_thd_curr()), flags); +// /* FIXME: elapsed_cycs accounting..?? 
*/ +// struct cos_thd_event ev = { .blocked = 1, .next_timeout = 0, .epoch = 0, .elapsed_cycs = 0 }; +// struct sl_thd *t = sl_thd_curr(); +// unsigned long *p = &sl_thd_dcbinfo(t)->pending, q = 0; +// int ret = 0; +// +// assert(sl_thd_rcvcap(t)); +// assert(!(flags & RCV_ULSCHED_RCV)); +// +//recheck: +// if ((q = ps_load(p)) == 0) { +// if (!(flags & RCV_ULONLY)) { +// ret = cos_rcv(sl_thd_rcvcap(t), flags); +// q = ps_load(p); +// goto done; +// } +// if (unlikely(flags & RCV_NON_BLOCKING)) return -EAGAIN; +// +// sl_cs_enter(); +// ev.epoch = sl_now(); +// sl_thd_event_enqueue(t, &ev); +// sl_thd_sched_block_no_cs(t, SL_THD_BLOCKED, 0); +// sl_cs_exit_switchto(sl__globals_core()->sched_thd); +// goto recheck; +// //q = ps_load(p); +// } +// assert(sl_thd_dcbinfo(t)->sp == 0); +// assert(q == 1); +// +//done: +// ps_upcas(p, q, 0); +////if (cos_spd_id() != 4) printc("[R%u]", cos_thdid()); +// return ret; } #endif /* SL_H */ diff --git a/src/components/interface/capmgr/capmgr.h b/src/components/interface/capmgr/capmgr.h index 7e1c873414..09fc89acbf 100644 --- a/src/components/interface/capmgr/capmgr.h +++ b/src/components/interface/capmgr/capmgr.h @@ -26,4 +26,8 @@ asndcap_t capmgr_asnd_key_create(cos_channelkey_t key); int capmgr_thd_migrate(thdid_t tid, thdcap_t tc, cpuid_t core); +int capmgr_hw_attach(hwid_t hwid, thdid_t tid); +int capmgr_hw_periodic_attach(hwid_t hwid, thdid_t tid, unsigned int period_us); +int capmgr_hw_detach(hwid_t hwid); + #endif /* CAPMGR_H */ diff --git a/src/components/interface/capmgr/stubs/s_stub.S b/src/components/interface/capmgr/stubs/s_stub.S index ef2d82a56a..4059d6a5db 100644 --- a/src/components/interface/capmgr/stubs/s_stub.S +++ b/src/components/interface/capmgr/stubs/s_stub.S @@ -23,6 +23,10 @@ cos_asm_server_stub(capmgr_asnd_rcv_create) cos_asm_server_stub(capmgr_asnd_key_create) cos_asm_server_stub(capmgr_thd_migrate) +cos_asm_server_stub(capmgr_hw_attach) +cos_asm_server_stub(capmgr_hw_periodic_attach) +cos_asm_server_stub(capmgr_hw_detach) + cos_asm_server_stub(memmgr_heap_page_allocn) cos_asm_server_stub_rets(memmgr_shared_page_allocn_cserialized) cos_asm_server_stub_rets(memmgr_shared_page_map_cserialized) diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 41dbd25ce0..fb90cb3e6d 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -665,43 +665,45 @@ static inline int __sl_sched_rcv(rcv_flags_t rf, struct cos_sched_event *e) { struct sl_global_core *g = sl__globals_core(); - struct sl_thd *curr = sl_thd_curr(); - struct cos_dcb_info *cd = sl_thd_dcbinfo(curr); - int ret = 0; - - assert(curr == g->sched_thd); - if (!cd) return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); - - rf |= RCV_ULSCHED_RCV; - - __asm__ __volatile__ ( \ - "pushl %%ebp\n\t" \ - "movl %%esp, %%ebp\n\t" \ - "movl $1f, (%%eax)\n\t" \ - "movl %%esp, 4(%%eax)\n\t" \ - "movl $2f, %%ecx\n\t" \ - "movl %%edx, %%eax\n\t" \ - "inc %%eax\n\t" \ - "shl $16, %%eax\n\t" \ - "movl $0, %%edx\n\t" \ - "movl $0, %%edi\n\t" \ - "sysenter\n\t" \ - "jmp 2f\n\t" \ - ".align 4\n\t" \ - "1:\n\t" \ - "movl $1, %%eax\n\t" \ - ".align 4\n\t" \ - "2:\n\t" \ - "popl %%ebp\n\t" \ - : "=a" (ret) - : "a" (cd), "b" (rf), "S" (g->timeout_next), "d" (g->sched_rcv) - : "memory", "cc", "ecx", "edi"); - -// if (cos_thdid() == 7) PRINTC("%s:%d %d\n", __func__, __LINE__, ret); - cd = sl_thd_dcbinfo(sl_thd_curr()); - cd->sp = 0; - - rf |= RCV_ULONLY; +// struct sl_thd *curr = sl_thd_curr(); +// struct cos_dcb_info *cd = 
sl_thd_dcbinfo(curr); +// int ret = 0; +//// if (cos_spd_id() != 4) printc("D"); +// +// assert(curr == g->sched_thd); +// if (!cd) return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); +// +// rf |= RCV_ULSCHED_RCV; +// +// __asm__ __volatile__ ( \ +// "pushl %%ebp\n\t" \ +// "movl %%esp, %%ebp\n\t" \ +// "movl $1f, (%%eax)\n\t" \ +// "movl %%esp, 4(%%eax)\n\t" \ +// "movl $2f, %%ecx\n\t" \ +// "movl %%edx, %%eax\n\t" \ +// "inc %%eax\n\t" \ +// "shl $16, %%eax\n\t" \ +// "movl $0, %%edx\n\t" \ +// "movl $0, %%edi\n\t" \ +// "sysenter\n\t" \ +// "jmp 2f\n\t" \ +// ".align 4\n\t" \ +// "1:\n\t" \ +// "movl $1, %%eax\n\t" \ +// ".align 4\n\t" \ +// "2:\n\t" \ +// "popl %%ebp\n\t" \ +// : "=a" (ret) +// : "a" (cd), "b" (rf), "S" (g->timeout_next), "d" (g->sched_rcv) +// : "memory", "cc", "ecx", "edi"); +// +//// if (cos_spd_id() != 4) printc("E"); +//// if (cos_thdid() == 7) PRINTC("%s:%d %d\n", __func__, __LINE__, ret); +// cd = sl_thd_dcbinfo(sl_thd_curr()); +// cd->sp = 0; +// +// rf |= RCV_ULONLY; return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); } @@ -722,13 +724,23 @@ sl_sched_loop_intern(int non_block) struct sl_child_notification notif; struct cos_sched_event e = { .tid = 0 }; + + struct sl_thd *curr = sl_thd_curr(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr); + assert(cd->sp == 0); /* * a child scheduler may receive both scheduling notifications (block/unblock * states of it's child threads) and normal notifications (mainly activations from * it's parent scheduler). */ //pending = cos_ul_sched_rcv(g->sched_rcv, rfl, g->timeout_next, &e); +// if (cos_spd_id() != 4) printc("L"); + //else printc("l"); pending = __sl_sched_rcv(rfl, &e); + assert(cd->sp == 0); +// if (cos_spd_id() != 4) printc("M"); + + //else printc("m"); if (pending < 0 || !e.tid) goto pending_events; diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index c223c38bc9..53e9f641d7 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -20,6 +20,8 @@ #define COS_DEFAULT_RET_CAP 0 +static int hw_asnd_call = 0; + /* * TODO: switch to a dedicated TLB flush thread (in a separate * protection domain) to do this. 
@@ -501,6 +503,9 @@ cap_thd_switch(struct pt_regs *regs, struct thread *curr, struct thread *next, s struct comp_info * next_ci = &(next->invstk[next->invstk_top].comp_info); int preempt = 0; + if (hw_asnd_call) { + //printk("[%d %d]\n", curr->tid, next->tid); + } assert(next_ci && curr && next); assert(curr->cpuid == get_cpuid() && next->cpuid == get_cpuid()); if (unlikely(curr == next)) return thd_switch_update(curr, regs, 1); @@ -527,7 +532,7 @@ cap_thd_switch(struct pt_regs *regs, struct thread *curr, struct thread *next, s preempt = thd_switch_update(next, &next->regs, 0); /* if switching to the preempted/awoken thread clear cpu local next_thdinfo */ - if (nti->thd && nti->thd == next) thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); + //if (nti->thd && nti->thd == next) thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); copy_all_regs(&next->regs, regs); @@ -714,10 +719,12 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st struct tcap *tcap = tcap_current(cos_info); int ret; + //printk("\n\n%u:%u %lu %lu %llu %lu %lu\n", thd->tid, next->tid, arcv, tc, prio, usr_counter, timeout); if (thd_cap->cpuid != get_cpuid() || thd_cap->cpuid != next->cpuid) return -EINVAL; if (unlikely(thd->dcbinfo && thd->dcbinfo->sp)) { - assert((unsigned long)regs->cx == thd->dcbinfo->ip + DCB_IP_KERN_OFF); - assert((unsigned long)regs->bp == thd->dcbinfo->sp); + //printk("\n%u: %u %lx %lx %lx\n", thd->tid, next->tid, regs->cx, thd->dcbinfo->ip, thd->dcbinfo->ip + DCB_IP_KERN_OFF); +// assert((unsigned long)regs->cx == thd->dcbinfo->ip + DCB_IP_KERN_OFF); +// assert((unsigned long)regs->bp == thd->dcbinfo->sp); } if (arcv) { @@ -757,7 +764,8 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st } ret = cap_switch(regs, thd, next, tcap, timeout, ci, cos_info); - if (tc && tcap_current(cos_info) == tcap) tcap_setprio(tcap, prio); + if (tc && tcap_current(cos_info) == tcap && prio) tcap_setprio(tcap, prio); + //printk("\n\n%u:%u-%d\n", thd->tid, next->tid,ret); return ret; } @@ -926,12 +934,19 @@ cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) rcv_thd = arcv->thd; rcv_tcap = rcv_thd->rcvcap.rcvcap_tcap; assert(rcv_tcap && tcap); + hw_asnd_call = 1; next = asnd_process(rcv_thd, thd, rcv_tcap, tcap, &tcap_next, 0, cos_info); - if (next == thd) return 1; + assert(next == rcv_thd); + if (next == thd) { + hw_asnd_call = 0; + return 1; + } thd->state |= THD_STATE_PREEMPTED; - return cap_switch(regs, thd, next, tcap_next, TCAP_TIME_NIL, ci, cos_info); + int p = cap_switch(regs, thd, next, tcap_next, TCAP_TIME_NIL, ci, cos_info); + hw_asnd_call = 0; + return p; } int @@ -994,11 +1009,13 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str if (thd_rcvcap_pending(thd)) { __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); thd_rcvcap_pending_deliver(thd, regs); + if (thd->dcbinfo) thd->dcbinfo->sp = 0; return 0; } else if (rflags & RCV_NON_BLOCKING) { __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); __userregs_setretvals(regs, -EAGAIN, 0, 0, 0); + if (thd->dcbinfo) thd->dcbinfo->sp = 0; return 0; } @@ -1009,20 +1026,20 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str if (unlikely(tc_next != thd_rcvcap_tcap(thd))) tc_next = thd_rcvcap_tcap(thd); /* if preempted/awoken thread is waiting, switch to that */ - if (nti->thd) { - assert(nti->tc); - - next = nti->thd; - tc_next = nti->tc; - tcap_setprio(nti->tc, nti->prio); - if (nti->budget) { - /* convert 
budget to timeout */ - cycles_t now; - rdtscll(now); - swtimeout = tcap_cyc2time(now + nti->budget); - } - thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); - } + //if (nti->thd) { + // assert(nti->tc); + + // next = nti->thd; + // tc_next = nti->tc; + // tcap_setprio(nti->tc, nti->prio); + // if (nti->budget) { + // /* convert budget to timeout */ + // cycles_t now; + // rdtscll(now); + // swtimeout = tcap_cyc2time(now + nti->budget); + // } + // thd_next_thdinfo_update(cos_info, 0, 0, 0, 0); + //} /* FIXME: for now, lets just ignore this path...need to plumb tcaps into it */ thd->interrupted_thread = NULL; @@ -1065,6 +1082,13 @@ cap_introspect(struct captbl *ct, capid_t capid, u32_t op, unsigned long *retval #define ENABLE_KERNEL_PRINT +#define cos_thd_throw(label, thd, errno) \ + { \ + ret = (errno); \ + if (unlikely(thd->dcbinfo)) thd->dcbinfo->sp = 0; \ + goto label; \ + } + static int composite_syscall_slowpath(struct pt_regs *regs, int *thd_switch); COS_SYSCALL __attribute__((section("__ipc_entry"))) int @@ -1130,7 +1154,8 @@ composite_syscall_handler(struct pt_regs *regs) switch (ch->type) { case CAP_THD: ret = cap_thd_op((struct cap_thd *)ch, thd, regs, ci, cos_info); - if (ret < 0) cos_throw(done, ret); + //printk("[%d]\n", ret); + if (ret < 0) cos_thd_throw(done, thd, ret); return ret; case CAP_ASND: ret = cap_asnd_op((struct cap_asnd *)ch, thd, regs, ci, cos_info); @@ -1138,7 +1163,7 @@ composite_syscall_handler(struct pt_regs *regs) return ret; case CAP_ARCV: ret = cap_arcv_op((struct cap_arcv *)ch, thd, regs, ci, cos_info); - if (ret < 0) cos_throw(done, ret); + if (ret < 0) cos_thd_throw(done, thd, ret); return ret; default: break; diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 6c3396aa7d..acbfdcf8a4 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -253,15 +253,17 @@ thd_rcvcap_set_counter(struct thread *t, sched_tok_t cntr) static void thd_rcvcap_pending_set(struct thread *arcvt) { - if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 1; - else arcvt->rcvcap.pending = 1; + if (likely(arcvt->dcbinfo)) { + arcvt->dcbinfo->pending = 1; + //printk("%u:%d\n", arcvt->tid, arcvt->dcbinfo->pending); + } + else arcvt->rcvcap.pending = 1; } static void thd_rcvcap_pending_reset(struct thread *arcvt) { arcvt->rcvcap.pending = 0; - if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 0; } static inline int @@ -660,10 +662,14 @@ thd_switch_update(struct thread *thd, struct pt_regs *regs, int issame) /* TODO: check FPU */ /* fpu_save(thd); */ if (thd->state & THD_STATE_PREEMPTED) { - assert(!(thd->state & THD_STATE_RCVING)); + /* TODO: assert that its a scheduler thread */ + /* assert(!(thd->state & THD_STATE_RCVING)); */ thd->state &= ~THD_STATE_PREEMPTED; preempt = 1; - } else if (thd->state & THD_STATE_RCVING) { + } + + /* FIXME: can the thread be in race with the kernel? 
*/ + if (thd->state & THD_STATE_RCVING) { assert(!(thd->state & THD_STATE_PREEMPTED)); thd->state &= ~THD_STATE_RCVING; thd_rcvcap_pending_deliver(thd, regs); diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index c6d0c4bacd..b12ad9c2cc 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -195,12 +195,21 @@ int hpet_periodic_handler(struct pt_regs *regs) { int preempt = 1; +static int count = 0; lapic_ack(); if (unlikely(hpet_calibration_init)) hpet_calibration(); - if (unlikely(hpet_periodicity_curr[HPET_PERIODIC] && !hpet_first_hpet_period)) rdtscll(hpet_first_hpet_period); + if (unlikely(hpet_periodicity_curr[HPET_PERIODIC] && !hpet_first_hpet_period)) { + count++; + + //printk("Y"); + if (count < 5) goto done; + rdtscll(hpet_first_hpet_period); + } + //printk("H"); preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][HW_HPET_PERIODIC], regs); +done: HPET_INT_ENABLE(HPET_PERIODIC); return preempt; @@ -304,6 +313,7 @@ chal_hpet_periodic_set(hwid_t hwid, unsigned long usecs_period) hpet_periodicity_curr[type] = usecs_period; if (type == HPET_PERIODIC) hpet_first_hpet_period = 0; hpet_set(type, hpetcyc_per_period); + chal_irq_enable(HW_HPET_PERIODIC, 0); printk("Setting HPET [%u:%u] Periodicity:%lu hpetcyc_per_period:%llu\n", hwid, type, usecs_period, hpetcyc_per_period); } } @@ -361,13 +371,13 @@ hpet_init(void) * Set the timer as specified. This assumes that the cycle * specification is in hpet cycles (not cpu cycles). */ - if (chal_msr_mhz && !lapic_timer_calib_init) { - hpet_cpucyc_per_tick = chal_msr_mhz * HPET_DEFAULT_PERIOD_US; - hpet_cpucyc_per_hpetcyc = hpet_cpucyc_per_tick / hpet_hpetcyc_per_tick; - printk("Timer not calibrated, instead computed using MSR frequency value\n"); + //if (chal_msr_mhz && !lapic_timer_calib_init) { + // hpet_cpucyc_per_tick = chal_msr_mhz * HPET_DEFAULT_PERIOD_US; + // hpet_cpucyc_per_hpetcyc = hpet_cpucyc_per_tick / hpet_hpetcyc_per_tick; + // printk("Timer not calibrated, instead computed using MSR frequency value\n"); - return; - } + // return; + //} hpet_calibration_init = 1; hpet_set(HPET_PERIODIC, hpet_hpetcyc_per_tick); diff --git a/src/platform/i386/qemu-kvm.sh b/src/platform/i386/qemu-kvm.sh index d1d7c43efe..5fb559c299 100755 --- a/src/platform/i386/qemu-kvm.sh +++ b/src/platform/i386/qemu-kvm.sh @@ -12,4 +12,4 @@ fi MODULES=$(sh $1 | awk '/^Writing image/ { print $3; }' | tr '\n' ' ') #qemu-system-i386 -m 768 -nographic -kernel kernel.img -no-reboot -s -initrd "$(echo $MODULES | tr ' ' ',')" -qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=6,threads=1 -cpu host -nographic -m 2048 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" +qemu-system-i386 -enable-kvm -rtc base=localtime,clock=host,driftfix=none -smp sockets=1,cores=2,threads=1 -cpu host -nographic -m 800 -kernel kernel.img -initrd "$(echo $MODULES | tr ' ' ',')" diff --git a/src/platform/i386/runscripts/unit_slite01.sh b/src/platform/i386/runscripts/unit_slite01.sh index 511e793c8e..00ee72b414 100644 --- a/src/platform/i386/runscripts/unit_slite01.sh +++ b/src/platform/i386/runscripts/unit_slite01.sh @@ -3,5 +3,6 @@ cp llboot_comp.o llboot.o cp root_fprr.o boot.o #cp unit_slrcvtest.o boot.o -cp test_boot.o dummy1.o -./cos_linker "llboot.o, ;*unit_slrcvtest.o, ;capmgr.o, ;dummy1.o, ;*boot.o, :boot.o-capmgr.o;unit_slrcvtest.o-boot.o|capmgr.o" ./gen_client_stub +#cp test_boot.o dummy2.o +./cos_linker "llboot.o, ;*unit_slrcvtest.o, ;capmgr.o, ;*spin_comp.o, ;*boot.o, 
:boot.o-capmgr.o;unit_slrcvtest.o-boot.o|capmgr.o;spin_comp.o-boot.o|capmgr.o" ./gen_client_stub +#./cos_linker "llboot.o, ;dummy2.o, ;capmgr.o, ;dummy1.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From b8a791e6505b4a117e11aed865bf25bda2bd814d Mon Sep 17 00:00:00 2001 From: phani Date: Tue, 24 Sep 2019 16:05:05 -0400 Subject: [PATCH 113/127] crt_chan interface for standard in and out * TODO! invocation + switch! --- src/components/implementation/sched/Makefile | 2 +- .../implementation/sched/chan_backend.c | 45 +++++ .../implementation/sched/hier_fprr/Makefile | 2 +- .../implementation/sched/root_fprr/Makefile | 2 +- .../sched/root_fprr_raw/Makefile | 2 +- .../implementation/sched/sched_info.c | 3 + .../implementation/sched/test_sched/Makefile | 10 + .../implementation/sched/test_sched/init.c | 180 ++++++++++++++++++ .../tests/test_schedinv/Makefile | 8 + .../tests/test_schedinv/test_schedinv.c | 111 +++++++++++ src/components/interface/crt/Makefile | 4 + src/components/interface/crt/chan_crt.h | 11 ++ src/components/interface/crt/stubs/s_stub.S | 20 ++ src/platform/i386/runscripts/test_slite02.sh | 11 ++ 14 files changed, 407 insertions(+), 4 deletions(-) create mode 100644 src/components/implementation/sched/chan_backend.c create mode 100644 src/components/implementation/sched/test_sched/Makefile create mode 100644 src/components/implementation/sched/test_sched/init.c create mode 100644 src/components/implementation/tests/test_schedinv/Makefile create mode 100644 src/components/implementation/tests/test_schedinv/test_schedinv.c create mode 100644 src/components/interface/crt/Makefile create mode 100644 src/components/interface/crt/chan_crt.h create mode 100644 src/components/interface/crt/stubs/s_stub.S create mode 100644 src/platform/i386/runscripts/test_slite02.sh diff --git a/src/components/implementation/sched/Makefile b/src/components/implementation/sched/Makefile index c0cd0d6743..43e1e0e4f1 100644 --- a/src/components/implementation/sched/Makefile +++ b/src/components/implementation/sched/Makefile @@ -1,3 +1,3 @@ -INTERFACES=sched schedinit +INTERFACES=sched schedinit crt include ../Makefile.subdir diff --git a/src/components/implementation/sched/chan_backend.c b/src/components/implementation/sched/chan_backend.c new file mode 100644 index 0000000000..20cc564459 --- /dev/null +++ b/src/components/implementation/sched/chan_backend.c @@ -0,0 +1,45 @@ +#include + +#include + +struct __sched_inout_chan { + struct crt_chan *in, *out; +} __sched_thds[NUM_CPU][MAX_NUM_THREADS]; + +CRT_CHAN_TYPE_PROTOTYPES(LU, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); + +void +__sched_stdio_init(void) +{ + memset(__sched_thds[cos_cpuid()], 0, MAX_NUM_THREADS * sizeof(struct __sched_inout_chan)); +} + +void +__sched_stdio_thd_init(thdid_t tid, struct crt_chan *in, struct crt_chan *out) +{ + __sched_thds[cos_cpuid()][tid].in = in; + __sched_thds[cos_cpuid()][tid].out = out; +} + +int +chan_out(unsigned long item) +{ + struct crt_chan *co = __sched_thds[cos_cpuid()][cos_thdid()].out; + + assert(co != NULL); + return crt_chan_send_LU(co, &item); +} + +unsigned long chan_in(void) +{ + unsigned long item = 0; + int ret = 0; + struct crt_chan *ci = __sched_thds[cos_cpuid()][cos_thdid()].in; + + assert(ci != NULL); + + ret = crt_chan_recv_LU(ci, &item); + assert(ret == 0); + + return item; +} diff --git a/src/components/implementation/sched/hier_fprr/Makefile b/src/components/implementation/sched/hier_fprr/Makefile index 5be22a8cbd..f19b907991 100644 --- a/src/components/implementation/sched/hier_fprr/Makefile 
+++ b/src/components/implementation/sched/hier_fprr/Makefile @@ -5,7 +5,7 @@ INTERFACES=sched schedinit DEPENDENCIES=capmgr sched schedinit IF_LIB= FN_PREPEND=parent_ -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/root_fprr/Makefile b/src/components/implementation/sched/root_fprr/Makefile index ec245e6d9f..e811b24382 100644 --- a/src/components/implementation/sched/root_fprr/Makefile +++ b/src/components/implementation/sched/root_fprr/Makefile @@ -4,7 +4,7 @@ COMPONENT=root_fprr.o INTERFACES=sched schedinit DEPENDENCIES=capmgr IF_LIB= -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/root_fprr_raw/Makefile b/src/components/implementation/sched/root_fprr_raw/Makefile index 5061883b7f..206a88478a 100644 --- a/src/components/implementation/sched/root_fprr_raw/Makefile +++ b/src/components/implementation/sched/root_fprr_raw/Makefile @@ -4,7 +4,7 @@ COMPONENT=root_fprr_raw.o INTERFACES=sched schedinit DEPENDENCIES= IF_LIB= -ADDITIONAL_LIBS=$(LIBSLRAW) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLRAW) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/sched_info.c b/src/components/implementation/sched/sched_info.c index 442705a6db..5a9bb457a2 100644 --- a/src/components/implementation/sched/sched_info.c +++ b/src/components/implementation/sched/sched_info.c @@ -70,6 +70,8 @@ sched_num_childsched_get(void) return sched_num_childsched[cos_cpuid()]; } +extern void __sched_stdio_init(void); + static void sched_childinfo_init_intern(int is_raw) { @@ -78,6 +80,7 @@ sched_childinfo_init_intern(int is_raw) comp_flag_t childflags; memset(childinfo[cos_cpuid()], 0, sizeof(struct sched_childinfo) * SCHED_MAX_CHILD_COMPS); + __sched_stdio_init(); while ((remaining = hypercall_comp_child_next(cos_spd_id(), &child, &childflags)) >= 0) { struct sched_childinfo *schedinfo = NULL; diff --git a/src/components/implementation/sched/test_sched/Makefile b/src/components/implementation/sched/test_sched/Makefile new file mode 100644 index 0000000000..b6383ecc8c --- /dev/null +++ b/src/components/implementation/sched/test_sched/Makefile @@ -0,0 +1,10 @@ +C_OBJS= +ASM_OBJS= +COMPONENT=test_sched.o +INTERFACES=sched schedinit crt +DEPENDENCIES=capmgr channel +IF_LIB= +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/sched/test_sched/init.c b/src/components/implementation/sched/test_sched/init.c new file mode 100644 index 0000000000..948ae26e93 --- /dev/null +++ b/src/components/implementation/sched/test_sched/init.c @@ -0,0 +1,180 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2018, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include +#include +#include +#include +#include +#include +#include + +u32_t cycs_per_usec = 0; +cycles_t *int_start = NULL; + +void +sched_child_init(struct sched_childinfo *schedci) +{ + vaddr_t dcbaddr; + struct sl_thd *initthd; + + assert(schedci); + assert(!(schedci->flags & COMP_FLAG_SCHED)); + schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, 0, 0, 0, 0, 0, &dcbaddr); + assert(schedci->initthd); + initthd = schedci->initthd; + + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, 1)); +} + +extern void __sched_stdio_thd_init(thdid_t, struct crt_chan *, struct crt_chan *); +#define MAX_PIPE_SZ 4 +CRT_CHAN_STATIC_ALLOC(c0, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c1, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c3, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); + +#define SPDID_INT 1 +#define SPDID_W1 3 +#define SPDID_W3 5 + +#define PRIO_INT MAX_PIPE_SZ + 1 +#define PRIO_W0 MAX_PIPE_SZ + 1 - 1 +#define PRIO_W1 MAX_PIPE_SZ + 1 - 2 +#define PRIO_W2 MAX_PIPE_SZ + 1 - 3 +#define PRIO_W3 MAX_PIPE_SZ + 1 - 4 + +#define SND_DATA 0x1234 + +#define SHMCHANNEL_KEY 0x2020 +#define MAX_ITERS 100 +int iters = 0; +cycles_t tot = 0, wc = 0; + +static void +work_thd_fn(void *data) +{ + int is_last = (int)data; + unsigned long i = 0; + + while (1) { + i = chan_in(); + if (unlikely(is_last)) { + //printc("[E%u]", cos_thdid()); + cycles_t end, diff; + rdtscll(end); + assert(int_start); + diff = end - *int_start; + if (wc < diff) wc = diff; + tot += diff; + iters++; + + if (iters == MAX_ITERS) { + printc("%llu, %llu\n", tot / iters, wc); + tot = wc = 0; + iters = 0; + } + continue; + } else { + //printc("[W%u]", cos_thdid()); + } + chan_out(SND_DATA); + } +} + +thdid_t +sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) +{ + vaddr_t addr; + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, &addr, NULL); + assert(t); + if (cos_inv_token() == SPDID_W1) { + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W1)); + __sched_stdio_thd_init(sl_thd_thdid(t), c1, c2); + } else if (cos_inv_token() == SPDID_W3) { + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W3)); + __sched_stdio_thd_init(sl_thd_thdid(t), c3, NULL); + } + + return t ? sl_thd_thdid(t) : 0; +} + +thdid_t +sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, int owntc, cos_channelkey_t key, microsec_t ipiwin, u32_t ipimax, arcvcap_t *extrcv) +{ + assert(cos_inv_token() == SPDID_INT); + int first = 1; + vaddr_t addr; + /* only 1 aep */ + if (!ps_cas(&first, 1, 0)) assert(0); + struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, ipiwin, ipimax, &addr, extrcv); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_INT)); + __sched_stdio_thd_init(sl_thd_thdid(t), NULL, c0); + + return t ? 
sl_thd_thdid(t) : 0; +} + +void +test_pipes_init(void) +{ + struct sl_thd *t = sl_thd_alloc(work_thd_fn, 1); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W0)); + __sched_stdio_thd_init(sl_thd_thdid(t), c0, NULL); + //__sched_stdio_thd_init(sl_thd_thdid(t), c0, c1); +// t = sl_thd_alloc(work_thd_fn, 0); +// assert(t); +// sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W2)); +// __sched_stdio_thd_init(sl_thd_thdid(t), c2, c3); +} + +void +cos_init(void) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo *ci = cos_compinfo_get(defci); + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static u32_t cpubmp[NUM_CPU_BMP_WORDS] = { 0 }; + int i; + + PRINTLOG(PRINT_DEBUG, "CPU cycles per sec: %u\n", cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE)); + + if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_defcompinfo_init(); + cos_init_args_cpubmp(cpubmp); + } else { + while (!ps_load((unsigned long *)&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + ps_faa((unsigned long *)&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ + for (i = 0; i < NUM_CPU; i++) { + if (!bitmap_check(cpubmp, i)) continue; + + while (!ps_load((unsigned long *)&init_done[i])) ; + } + + sl_init_corebmp(100*SL_MIN_PERIOD_US, cpubmp); + vaddr_t tscaddr = 0; + cbuf_t id = channel_shared_page_alloc(SHMCHANNEL_KEY, &tscaddr); + assert(id >= 0); + int_start = (cycles_t *)tscaddr; + *int_start = 0ULL; + sched_childinfo_init(); + test_pipes_init(); + self_init[cos_cpuid()] = 1; + hypercall_comp_init_done(); + + sl_sched_loop_nonblock(); + + PRINTLOG(PRINT_ERROR, "Should never have reached this point!!!\n"); + assert(0); +} diff --git a/src/components/implementation/tests/test_schedinv/Makefile b/src/components/implementation/tests/test_schedinv/Makefile new file mode 100644 index 0000000000..859fb3dd71 --- /dev/null +++ b/src/components/implementation/tests/test_schedinv/Makefile @@ -0,0 +1,8 @@ +COMPONENT=test_sched_inv.o +INTERFACES= +DEPENDENCIES= crt sched capmgr channel +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api #required for cos_sinv in llboot.h! + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/test_schedinv/test_schedinv.c b/src/components/implementation/tests/test_schedinv/test_schedinv.c new file mode 100644 index 0000000000..6b09d581af --- /dev/null +++ b/src/components/implementation/tests/test_schedinv/test_schedinv.c @@ -0,0 +1,111 @@ +/* + * Copyright 2018, Phani Gadepalli and Gabriel Parmer, GWU, gparmer@gwu.edu. + * + * This uses a two clause BSD License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define SPDID_INT 1 +#define SPDID_W1 3 +#define SPDID_W3 5 + +static u32_t cycs_per_usec = 0; + +#define SND_DATA 0x4321 +#define HPET_PERIOD_TEST_US 5000 + +#define SHMCHANNEL_KEY 0x2020 +static cycles_t *sttsc = NULL; +static void +__test_int_fn(arcvcap_t rcv, void *data) +{ + int a = capmgr_hw_periodic_attach(HW_HPET_PERIODIC, cos_thdid(), HPET_PERIOD_TEST_US); + assert(a == 0); + + /* TODO: register to HPET */ + while (1) { + cos_rcv(rcv, 0); + rdtscll(*sttsc); + //printc("[i%u]", cos_thdid()); + chan_out(SND_DATA); + } + + sched_thd_exit(); +} + +#define ITERS 100 +cycles_t tot = 0, wc = 0; +int iters = 0; + +static void +__test_wrk_fn(void *data) +{ + int e = (int) data; + while (1) { + chan_in(); + + if (unlikely(e)) { + //printc("[e%u]", cos_thdid()); + cycles_t en, diff; + + rdtscll(en); + assert(sttsc); + diff = en - *sttsc; + if (diff > wc) wc = diff; + tot += diff; + iters++; + if (iters == ITERS) { + printc("%llu, %llu\n", tot / ITERS, wc); + tot = wc = 0; + iters = 0; + } + continue; + } else { + //printc("[w%u]", cos_thdid()); + } + chan_out(SND_DATA); + } +} + +struct cos_aep_info intaep; + +static void +test_aeps(void) +{ + thdid_t tid; + int ret; + int i = 0; + + if (cos_spd_id() == SPDID_INT) { + tid = sched_aep_create(&intaep, __test_int_fn, (void *)0, 0, 0, 0, 0); + } else { + tid = sched_thd_create(__test_wrk_fn, cos_spd_id() == SPDID_W3 ? (void *)1: (void *)0); + } + assert(tid); +} + +void +cos_init(void) +{ + spdid_t child; + comp_flag_t childflags; + + vaddr_t addr = 0; + unsigned long pages = 0; + cbuf_t id = channel_shared_page_map(SHMCHANNEL_KEY, &addr, &pages); + assert(id >= 0 && addr && pages == 1); + sttsc = (cycles_t *)addr; + cycs_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + + assert(hypercall_comp_child_next(cos_spd_id(), &child, &childflags) == -1); + test_aeps(); + + sched_thd_exit(); +} diff --git a/src/components/interface/crt/Makefile b/src/components/interface/crt/Makefile new file mode 100644 index 0000000000..6015b0c902 --- /dev/null +++ b/src/components/interface/crt/Makefile @@ -0,0 +1,4 @@ +LIB_OBJS= +LIBS=$(LIB_OBJS:%.o=%.a) + +include ../Makefile.subdir diff --git a/src/components/interface/crt/chan_crt.h b/src/components/interface/crt/chan_crt.h new file mode 100644 index 0000000000..16386def5b --- /dev/null +++ b/src/components/interface/crt/chan_crt.h @@ -0,0 +1,11 @@ +#ifndef CHAN_CRT_H +#define CHAN_CRT_H + +#define CHAN_CRT_NSLOTS 4 +#define CHAN_CRT_ITEM_TYPE unsigned long +#define CHAN_CRT_ITEM_SZ sizeof(CHAN_CRT_ITEM_TYPE) + +int chan_out(unsigned long item); +unsigned long chan_in(void); + +#endif /* CHAN_CRT_H */ diff --git a/src/components/interface/crt/stubs/s_stub.S b/src/components/interface/crt/stubs/s_stub.S new file mode 100644 index 0000000000..806aea9e19 --- /dev/null +++ b/src/components/interface/crt/stubs/s_stub.S @@ -0,0 +1,20 @@ +/** + * Redistribution of this file is permitted under the BSD two clause license. 
+ * + * Copyright 2019, The George Washington University + * Author: Phani Gadepalli, phanikishoreg@gwu.edu + */ + +#include + +.text +cos_asm_server_stub(chan_out) +cos_asm_server_stub(chan_in) +//cos_asm_server_stub(chan_init) +//cos_asm_server_stub(chan_teardown) +//cos_asm_server_stub(chan_in_get) +//cos_asm_server_stub(chan_out_get) +//cos_asm_server_stub(chan_send) +//cos_asm_server_stub(chan_recv) +//cos_asm_server_stub(chan_async_send) +//cos_asm_server_stub(chan_async_recv) diff --git a/src/platform/i386/runscripts/test_slite02.sh b/src/platform/i386/runscripts/test_slite02.sh new file mode 100644 index 0000000000..14a90e1aa3 --- /dev/null +++ b/src/platform/i386/runscripts/test_slite02.sh @@ -0,0 +1,11 @@ +#!/bin/sh + +cp llboot_comp.o llboot.o +cp test_sched.o boot.o +cp test_sched_inv.o intcomp.o +#cp test_sched_inv.o w1comp.o +#cp test_sched_inv.o w3comp.o +#./cos_linker "llboot.o, ;intcomp.o, ;capmgr.o, ;w1comp.o, ;*boot.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" ./gen_client_stub + +cp test_boot.o dummy.o +./cos_linker "llboot.o, ;intcomp.o, ;capmgr.o, ;dummy.o, ;*boot.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub From 6520d9e3ea0e0a8bff1691ea12792dc89e25d691 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 26 Sep 2019 17:41:38 -0400 Subject: [PATCH 114/127] invocations + slite switch * if ulthd != kthd, get the comp_info of the ulthd to proceed from lazyupdate!! --- .../implementation/capmgr/naive/cap_mgr.c | 7 +- .../no_interface/llbooter/boot_deps.h | 16 +++- .../implementation/sched/chan_backend.c | 3 +- .../implementation/sched/test_sched/init.c | 41 ++++----- .../tests/test_schedinv/test_schedinv.c | 21 +++-- src/components/include/part.h | 3 +- src/components/include/sl.h | 6 +- src/components/lib/sl/sl_capmgr.c | 10 ++- src/components/lib/sl/sl_sched.c | 83 ++++++++++--------- src/kernel/capinv.c | 24 +----- src/kernel/include/scb.h | 5 +- src/kernel/include/thd.h | 8 +- src/platform/i386/hpet.c | 4 +- src/platform/i386/runscripts/test_slite02.sh | 15 ++-- 14 files changed, 130 insertions(+), 116 deletions(-) diff --git a/src/components/implementation/capmgr/naive/cap_mgr.c b/src/components/implementation/capmgr/naive/cap_mgr.c index 3886ac4919..fcf938b936 100644 --- a/src/components/implementation/capmgr/naive/cap_mgr.c +++ b/src/components/implementation/capmgr/naive/cap_mgr.c @@ -67,7 +67,8 @@ capmgr_thd_create_ext_cserialized(struct cos_dcb_info **dcb, thdid_t *tid, spdid if (cap_info_is_sched(s)) return 0; if (idx <= 0) return 0; - dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rs)), &dcboff, &dcbaddr); + /* s is not a scheduler, dcbinfo will be in the scheduler component */ + dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rc)), &dcboff, &dcbaddr); if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! 
*/ t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rs), NULL, idx, 0, 0, 0, dcbcap, dcboff, 0, 0, NULL); if (!t) return 0; @@ -259,7 +260,9 @@ capmgr_aep_create_ext_cserialized(struct cos_dcb_info **dcb, u32_t *rcvtcret, u3 rinit = cap_info_initthd(rc); if (!rinit) return 0; - dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rs)), &dcboff, &dcbaddr); + /* if s is not a scheduler, dcbinfo will be in the scheduler component */ + //if (cap_info_is_sched(s)) dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rs)), &dcboff, &dcbaddr); + /*else*/ dcbcap = cos_dcb_info_alloc(cap_info_cpu_dcbdata(cap_info_cpu_local(rc)), &dcboff, &dcbaddr); if (!dcbcap || !dcbaddr || !dcboff) return 0; /* dcboff == 0 for initthd in that comp! */ t = sl_thd_aep_alloc_ext_dcb(cap_info_dci(rs), rinit, tidx, 1, owntc, 0, dcbcap, dcboff, ipiwin, ipimax, &srcrcv); if (!t) return 0; diff --git a/src/components/implementation/no_interface/llbooter/boot_deps.h b/src/components/implementation/no_interface/llbooter/boot_deps.h index 70a1654f53..8c2fab7cbe 100644 --- a/src/components/implementation/no_interface/llbooter/boot_deps.h +++ b/src/components/implementation/no_interface/llbooter/boot_deps.h @@ -39,6 +39,7 @@ struct comp_cap_info { u32_t cpu_bitmap[NUM_CPU_BMP_WORDS]; struct comp_sched_info *schedinfo[NUM_CPU]; struct cos_component_information *cobj_info; + scbcap_t scbcap; } new_comp_cap_info[MAX_NUM_SPDS]; int schedule[NUM_CPU][MAX_NUM_SPDS]; @@ -374,6 +375,7 @@ boot_newcomp_create(spdid_t spdid, struct cos_compinfo *comp_info) struct cos_compinfo *boot_info = boot_spd_compinfo_curr_get(); struct comp_cap_info *spdinfo = boot_spd_compcapinfo_get(spdid); struct cos_component_information *cobj_info = boot_spd_comp_cobj_info_get(spdid); + struct comp_sched_info *spdsi = boot_spd_comp_schedinfo_get(spdid); captblcap_t ct = compinfo->captbl_cap; pgtblcap_t pt = compinfo->pgtbl_cap; compcap_t cc; @@ -385,10 +387,16 @@ boot_newcomp_create(spdid_t spdid, struct cos_compinfo *comp_info) vaddr_t scb_uaddr = 0; scbcap_t scbcap = 0; - scbcap = cos_scb_alloc(boot_info); - assert(scbcap); - scb_uaddr = cos_page_bump_intern_valloc(compinfo, COS_SCB_SIZE); - assert(scb_uaddr); + if (spdsi->flags & COMP_FLAG_SCHED) { + scbcap = cos_scb_alloc(boot_info); + assert(scbcap); + spdinfo->scbcap = scbcap; + scb_uaddr = cos_page_bump_intern_valloc(compinfo, COS_SCB_SIZE); + assert(scb_uaddr); + } else if (spdsi->parent_spdid) { + struct comp_cap_info *psi = boot_spd_compcapinfo_get(spdsi->parent_spdid); + scbcap = psi->scbcap; + } if (spdinfo->initdcbpgs == 0) { vaddr_t dcbaddr = 0; diff --git a/src/components/implementation/sched/chan_backend.c b/src/components/implementation/sched/chan_backend.c index 20cc564459..ee3712a623 100644 --- a/src/components/implementation/sched/chan_backend.c +++ b/src/components/implementation/sched/chan_backend.c @@ -30,7 +30,8 @@ chan_out(unsigned long item) return crt_chan_send_LU(co, &item); } -unsigned long chan_in(void) +unsigned long +chan_in(void) { unsigned long item = 0; int ret = 0; diff --git a/src/components/implementation/sched/test_sched/init.c b/src/components/implementation/sched/test_sched/init.c index 948ae26e93..1dfa424e70 100644 --- a/src/components/implementation/sched/test_sched/init.c +++ b/src/components/implementation/sched/test_sched/init.c @@ -15,6 +15,7 @@ u32_t cycs_per_usec = 0; cycles_t *int_start = NULL; +volatile unsigned long *rdy = NULL; void sched_child_init(struct sched_childinfo *schedci) @@ -38,15 +39,15 @@ CRT_CHAN_STATIC_ALLOC(c1, 
CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c3, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); -#define SPDID_INT 1 -#define SPDID_W1 3 -#define SPDID_W3 5 +#define SPDID_INT 5 +#define SPDID_W1 6 +#define SPDID_W3 7 -#define PRIO_INT MAX_PIPE_SZ + 1 -#define PRIO_W0 MAX_PIPE_SZ + 1 - 1 -#define PRIO_W1 MAX_PIPE_SZ + 1 - 2 -#define PRIO_W2 MAX_PIPE_SZ + 1 - 3 -#define PRIO_W3 MAX_PIPE_SZ + 1 - 4 +#define PRIO_INT MAX_PIPE_SZ + 4 +#define PRIO_W0 MAX_PIPE_SZ + 4 - 1 +#define PRIO_W1 MAX_PIPE_SZ + 4 - 2 +#define PRIO_W2 MAX_PIPE_SZ + 4 - 3 +#define PRIO_W3 MAX_PIPE_SZ + 4 - 4 #define SND_DATA 0x1234 @@ -61,10 +62,11 @@ work_thd_fn(void *data) int is_last = (int)data; unsigned long i = 0; + ps_faa(rdy, 1); + while (1) { i = chan_in(); if (unlikely(is_last)) { - //printc("[E%u]", cos_thdid()); cycles_t end, diff; rdtscll(end); assert(int_start); @@ -79,8 +81,6 @@ work_thd_fn(void *data) iters = 0; } continue; - } else { - //printc("[W%u]", cos_thdid()); } chan_out(SND_DATA); } @@ -122,15 +122,16 @@ sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, void test_pipes_init(void) { - struct sl_thd *t = sl_thd_alloc(work_thd_fn, 1); + struct sl_thd *t = sl_thd_alloc(work_thd_fn, 0); assert(t); sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W0)); - __sched_stdio_thd_init(sl_thd_thdid(t), c0, NULL); - //__sched_stdio_thd_init(sl_thd_thdid(t), c0, c1); -// t = sl_thd_alloc(work_thd_fn, 0); -// assert(t); -// sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W2)); -// __sched_stdio_thd_init(sl_thd_thdid(t), c2, c3); +// __sched_stdio_thd_init(sl_thd_thdid(t), c0, NULL); + __sched_stdio_thd_init(sl_thd_thdid(t), c0, c1); + t = sl_thd_alloc(work_thd_fn, 0); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W2)); + __sched_stdio_thd_init(sl_thd_thdid(t), c2, c3); +// __sched_stdio_thd_init(sl_thd_thdid(t), c2, NULL); } void @@ -162,12 +163,14 @@ cos_init(void) while (!ps_load((unsigned long *)&init_done[i])) ; } - sl_init_corebmp(100*SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(500*SL_MIN_PERIOD_US, cpubmp); vaddr_t tscaddr = 0; cbuf_t id = channel_shared_page_alloc(SHMCHANNEL_KEY, &tscaddr); assert(id >= 0); int_start = (cycles_t *)tscaddr; *int_start = 0ULL; + rdy = (volatile unsigned long *)(int_start + 1); + *rdy = 0; sched_childinfo_init(); test_pipes_init(); self_init[cos_cpuid()] = 1; diff --git a/src/components/implementation/tests/test_schedinv/test_schedinv.c b/src/components/implementation/tests/test_schedinv/test_schedinv.c index 6b09d581af..e71231edcc 100644 --- a/src/components/implementation/tests/test_schedinv/test_schedinv.c +++ b/src/components/implementation/tests/test_schedinv/test_schedinv.c @@ -11,21 +11,29 @@ #include #include #include +#include -#define SPDID_INT 1 -#define SPDID_W1 3 -#define SPDID_W3 5 +#define SPDID_INT 5 +#define SPDID_W1 6 +#define SPDID_W3 7 static u32_t cycs_per_usec = 0; +#define MAX_PIPE_SZ 4 + #define SND_DATA 0x4321 #define HPET_PERIOD_TEST_US 5000 #define SHMCHANNEL_KEY 0x2020 static cycles_t *sttsc = NULL; +volatile unsigned long *rdy = NULL; + static void __test_int_fn(arcvcap_t rcv, void *data) { + ps_faa(rdy, 1); + + while (ps_load(rdy) <= MAX_PIPE_SZ) sched_thd_block_timeout(0, time_now() + time_usec2cyc(HPET_PERIOD_TEST_US)); int a = capmgr_hw_periodic_attach(HW_HPET_PERIODIC, cos_thdid(), HPET_PERIOD_TEST_US); assert(a == 0); @@ -33,7 +41,6 @@ __test_int_fn(arcvcap_t rcv, void *data) while (1) { cos_rcv(rcv, 0); 
rdtscll(*sttsc); - //printc("[i%u]", cos_thdid()); chan_out(SND_DATA); } @@ -48,11 +55,11 @@ static void __test_wrk_fn(void *data) { int e = (int) data; + ps_faa(rdy, 1); while (1) { chan_in(); if (unlikely(e)) { - //printc("[e%u]", cos_thdid()); cycles_t en, diff; rdtscll(en); @@ -67,8 +74,6 @@ __test_wrk_fn(void *data) iters = 0; } continue; - } else { - //printc("[w%u]", cos_thdid()); } chan_out(SND_DATA); } @@ -102,6 +107,8 @@ cos_init(void) cbuf_t id = channel_shared_page_map(SHMCHANNEL_KEY, &addr, &pages); assert(id >= 0 && addr && pages == 1); sttsc = (cycles_t *)addr; + rdy = (volatile unsigned long *)(sttsc + 1); + cycs_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); assert(hypercall_comp_child_next(cos_spd_id(), &child, &childflags) == -1); diff --git a/src/components/include/part.h b/src/components/include/part.h index 77decb709f..f4ea8cc9bc 100644 --- a/src/components/include/part.h +++ b/src/components/include/part.h @@ -307,7 +307,7 @@ static inline void part_task_barrier(struct part_task *t, int is_end) { struct sl_thd *ts = sl_thd_curr(); - unsigned cbc = 0, cbep = 0; + unsigned cbc = 0; int is_master = t->master == PART_CURR_THD ? 1 : 0; assert(t->type != PART_TASK_T_NONE); @@ -343,7 +343,6 @@ part_task_barrier(struct part_task *t, int is_end) assert(t->type == PART_TASK_T_WORKSHARE); - cbep = ps_load(&t->barrier_epoch); cbc = ps_faa(&t->barrier, -1); if (cbc > 1) { sl_thd_block(0); diff --git a/src/components/include/sl.h b/src/components/include/sl.h index d8ba28fa23..058293d33d 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -41,7 +41,7 @@ #include #define SL_CS -#define SL_REPLENISH +#undef SL_REPLENISH /* Critical section (cs) API to protect scheduler data-structures */ struct sl_cs { @@ -612,7 +612,9 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) struct sl_thd *t = to, *c = sl_thd_curr(); struct sl_global_core *globals = sl__globals_core(); sched_tok_t tok; +#ifdef SL_REPLENISH cycles_t now; +#endif s64_t offset; int ret; @@ -622,7 +624,9 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) #endif tok = cos_sched_sync(); +#ifdef SL_REPLENISH now = sl_now(); +#endif /* * Once we exit, we can't trust t's memory as it could be diff --git a/src/components/lib/sl/sl_capmgr.c b/src/components/lib/sl/sl_capmgr.c index e0e5be9b01..d160c2fadc 100644 --- a/src/components/lib/sl/sl_capmgr.c +++ b/src/components/lib/sl/sl_capmgr.c @@ -151,6 +151,7 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vad struct cos_compinfo *compci = cos_compinfo_get(comp); struct sl_thd *t = NULL; struct cos_aep_info *aep = NULL; + struct cos_dcb_info *dcb = NULL; if (comp == NULL || comp->id == 0) goto done; @@ -158,11 +159,11 @@ sl_thd_alloc_ext_no_cs(struct cos_defcompinfo *comp, thdclosure_index_t idx, vad aep = sl_thd_alloc_aep_backend(); if (!aep) goto done; - aep->thd = capmgr_thd_create_ext(comp->id, idx, &aep->tid, (struct cos_dcb_info **)dcbuaddr); + aep->thd = capmgr_thd_create_ext(comp->id, idx, &aep->tid, &dcb); if (!aep->thd) goto done; aep->tc = sl_thd_tcap(sl__globals_core()->sched_thd); - t = sl_thd_alloc_init(aep, 0, 0, NULL); + t = sl_thd_alloc_init(aep, 0, 0, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } else { @@ -186,6 +187,7 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t { struct cos_aep_info *aep = NULL; struct sl_thd *t = NULL; + struct cos_dcb_info *dcb = NULL; asndcap_t snd = 0; int ret = 0, 
owntc = 0; @@ -207,10 +209,10 @@ sl_thd_aep_alloc_ext_no_cs(struct cos_defcompinfo *comp, struct sl_thd *sched, t if (!aep) goto done; if (prps & SL_THD_PROPERTY_OWN_TCAP) owntc = 1; - capmgr_aep_create_ext(comp->id, aep, idx, owntc, key, ipiwin, ipimax, (struct cos_dcb_info **)dcbuaddr, extrcv); + capmgr_aep_create_ext(comp->id, aep, idx, owntc, key, ipiwin, ipimax, &dcb, extrcv); if (!aep->thd) goto done; - t = sl_thd_alloc_init(aep, 0, prps, NULL); + t = sl_thd_alloc_init(aep, 0, prps, dcb); sl_mod_thd_create(sl_mod_thd_policy_get(t)); ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), 1); } diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index fb90cb3e6d..06735f92d4 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -218,6 +218,8 @@ sl_thd_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t timeout { assert(t && sl_thd_curr() == t); /* only current thread is allowed to block itself */ assert(t != sl__globals_core()->idle_thd && t != sl__globals_core()->sched_thd); + /* interrupt thread could run and block itself before scheduler sees any of that! */ + sl_thd_sched_unblock_no_cs(t); assert(sl_thd_is_runnable(t)); assert(block_type == SL_THD_BLOCKED_TIMEOUT || block_type == SL_THD_BLOCKED); @@ -234,7 +236,6 @@ sl_thd_block_no_cs(struct sl_thd *t, sl_thd_state_t block_type, cycles_t timeout } /* reset rcv_suspended if the scheduler thinks "curr" was suspended on cos_rcv previously */ - sl_thd_sched_unblock_no_cs(t); assert(t->state == SL_THD_RUNNABLE); sl_mod_block(sl_mod_thd_policy_get(t)); ps_faa(&(sl__globals()->nthds_running[cos_cpuid()]), -1); @@ -665,45 +666,47 @@ static inline int __sl_sched_rcv(rcv_flags_t rf, struct cos_sched_event *e) { struct sl_global_core *g = sl__globals_core(); -// struct sl_thd *curr = sl_thd_curr(); -// struct cos_dcb_info *cd = sl_thd_dcbinfo(curr); -// int ret = 0; -//// if (cos_spd_id() != 4) printc("D"); -// -// assert(curr == g->sched_thd); -// if (!cd) return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); -// -// rf |= RCV_ULSCHED_RCV; -// -// __asm__ __volatile__ ( \ -// "pushl %%ebp\n\t" \ -// "movl %%esp, %%ebp\n\t" \ -// "movl $1f, (%%eax)\n\t" \ -// "movl %%esp, 4(%%eax)\n\t" \ -// "movl $2f, %%ecx\n\t" \ -// "movl %%edx, %%eax\n\t" \ -// "inc %%eax\n\t" \ -// "shl $16, %%eax\n\t" \ -// "movl $0, %%edx\n\t" \ -// "movl $0, %%edi\n\t" \ -// "sysenter\n\t" \ -// "jmp 2f\n\t" \ -// ".align 4\n\t" \ -// "1:\n\t" \ -// "movl $1, %%eax\n\t" \ -// ".align 4\n\t" \ -// "2:\n\t" \ -// "popl %%ebp\n\t" \ -// : "=a" (ret) -// : "a" (cd), "b" (rf), "S" (g->timeout_next), "d" (g->sched_rcv) -// : "memory", "cc", "ecx", "edi"); -// -//// if (cos_spd_id() != 4) printc("E"); -//// if (cos_thdid() == 7) PRINTC("%s:%d %d\n", __func__, __LINE__, ret); -// cd = sl_thd_dcbinfo(sl_thd_curr()); -// cd->sp = 0; -// -// rf |= RCV_ULONLY; +#if 0 + struct sl_thd *curr = sl_thd_curr(); + struct cos_dcb_info *cd = sl_thd_dcbinfo(curr); + int ret = 0; +// if (cos_spd_id() != 4) printc("D"); + + assert(curr == g->sched_thd); + if (!cd) return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); + + rf |= RCV_ULSCHED_RCV; + + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $1f, (%%eax)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "movl $2f, %%ecx\n\t" \ + "movl %%edx, %%eax\n\t" \ + "inc %%eax\n\t" \ + "shl $16, %%eax\n\t" \ + "movl $0, %%edx\n\t" \ + "movl $0, %%edi\n\t" \ + "sysenter\n\t" \ + "jmp 2f\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl 
$1, %%eax\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "popl %%ebp\n\t" \ + : "=a" (ret) + : "a" (cd), "b" (rf), "S" (g->timeout_next), "d" (g->sched_rcv) + : "memory", "cc", "ecx", "edi"); + +// if (cos_spd_id() != 4) printc("E"); +// if (cos_thdid() == 7) PRINTC("%s:%d %d\n", __func__, __LINE__, ret); + cd = sl_thd_dcbinfo(sl_thd_curr()); + cd->sp = 0; + + rf |= RCV_ULONLY; +#endif return cos_ul_sched_rcv(g->sched_rcv, rf, g->timeout_next, e); } diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 53e9f641d7..91b052881b 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -94,10 +94,9 @@ cap_ulthd_lazyupdate(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, struct cap_thd *ch_ult = NULL; struct thread *ulthd = NULL; capid_t ultc = 0; - int invstk_top = 0; struct cos_scb_info *scb_core = NULL; /* per-core scb_info */ - *ci_ptr = thd_invstk_current_compinfo(thd, cos_info, &invstk_top); + *ci_ptr = thd_invstk_current_compinfo(thd, cos_info); assert(*ci_ptr && (*ci_ptr)->captbl); @@ -113,29 +112,12 @@ cap_ulthd_lazyupdate(struct pt_regs *regs, struct cos_cpu_local_info *cos_info, if (unlikely(!CAP_TYPECHK_CORE(ch_ult, CAP_THD))) ch_ult = NULL; else ulthd = ch_ult->t; } - - if (unlikely(interrupt)) { - struct thread *fixthd = thd; - - assert(scb_core->sched_tok < ~0U); - cos_faa((int *)&(scb_core->sched_tok), 1); - - if (ulthd) fixthd = ulthd; - - if (unlikely(fixthd->dcbinfo && fixthd->dcbinfo->sp)) { - regs->ip = fixthd->dcbinfo->ip + DCB_IP_KERN_OFF; - regs->sp = fixthd->dcbinfo->sp; - regs->dx = 0; /* sched token is in edx! */ - - fixthd->dcbinfo->sp = 0; - } - } if (unlikely(!ultc || !ulthd || ulthd->dcbinfo == NULL)) goto done; if (ulthd == thd) goto done; - /* check if kcurr and ucurr threads are both in the same page-table(component) */ - if (thd_current_pgtbl(ulthd) != thd_current_pgtbl(thd)) goto done; + thd_current_update(ulthd, thd, cos_info); thd = ulthd; + *ci_ptr = thd_invstk_current_compinfo(thd, cos_info); done: return thd; diff --git a/src/kernel/include/scb.h b/src/kernel/include/scb.h index ca80a7036a..98c112cd8c 100644 --- a/src/kernel/include/scb.h +++ b/src/kernel/include/scb.h @@ -68,9 +68,10 @@ scb_comp_update(struct captbl *ct, struct cap_scb *sc, struct cap_comp *compc, s paddr_t pf = chal_va2pa((void *)(sc->kern_addr)); if (unlikely(!ltbl_isalive(&sc->liveness))) return -EPERM; - if (pgtbl_mapping_add(ptcin->pgtbl, uaddrin, pf, PGTBL_USER_DEF)) return -EINVAL; + /* for non-schedulers, scbs are from schedulers, so uaddrin will be zero and sc->compc should have been set! 
*/ + if (uaddrin && pgtbl_mapping_add(ptcin->pgtbl, uaddrin, pf, PGTBL_USER_DEF)) return -EINVAL; - sc->compc = compc; + if (uaddrin && sc->compc == NULL) sc->compc = compc; compc->info.scb_data = (struct cos_scb_info *)(sc->kern_addr); return 0; diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index acbfdcf8a4..863301702c 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -295,7 +295,7 @@ static inline void thd_current_update(struct thread *next, struct thread *prev, struct cos_cpu_local_info *cos_info) { /* commit the cached data */ - prev->invstk_top = cos_info->invstk_top; + prev->invstk_top = cos_info->invstk_top; cos_info->invstk_top = next->invstk_top; cos_info->curr_thd = next; } @@ -535,11 +535,9 @@ thd_invstk_peek_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos } static inline struct comp_info * -thd_invstk_current_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info, int *invstk_top) +thd_invstk_current_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info) { - *invstk_top = curr_invstk_top(cos_info); - - return &(curr_thd->invstk[*invstk_top].comp_info); + return &(curr_thd->invstk[curr_invstk_top(cos_info)].comp_info); } static inline struct comp_info * diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index 743c938a33..350628a09e 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -202,11 +202,9 @@ static int count = 0; if (unlikely(hpet_periodicity_curr[HPET_PERIODIC] && !hpet_first_hpet_period)) { count++; - //printk("Y"); - if (count < 5) goto done; + if (count < 15) goto done; rdtscll(hpet_first_hpet_period); } - //printk("H"); preempt = cap_hw_asnd(&hw_asnd_caps[get_cpuid()][HW_HPET_PERIODIC], regs); done: diff --git a/src/platform/i386/runscripts/test_slite02.sh b/src/platform/i386/runscripts/test_slite02.sh index 14a90e1aa3..0fb9290e71 100644 --- a/src/platform/i386/runscripts/test_slite02.sh +++ b/src/platform/i386/runscripts/test_slite02.sh @@ -3,9 +3,14 @@ cp llboot_comp.o llboot.o cp test_sched.o boot.o cp test_sched_inv.o intcomp.o -#cp test_sched_inv.o w1comp.o -#cp test_sched_inv.o w3comp.o -#./cos_linker "llboot.o, ;intcomp.o, ;capmgr.o, ;w1comp.o, ;*boot.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" ./gen_client_stub +cp test_sched_inv.o w1comp.o +cp test_boot.o dummy1.o +cp test_boot.o dummy2.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o" ./gen_client_stub +#./cos_linker "llboot.o, ;intcomp.o, ;capmgr.o, ;w1comp.o, ;*boot.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o" ./gen_client_stub +cp test_sched_inv.o w3comp.o +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" ./gen_client_stub -cp test_boot.o dummy.o -./cos_linker "llboot.o, ;intcomp.o, ;capmgr.o, ;dummy.o, ;*boot.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub +#cp test_boot.o dummy.o +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub +# From e5fbdc9a1abd845a2988444a0dedf691d09fa98a Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 2 Oct 2019 14:14:56 -0400 Subject: [PATCH 115/127] p2p channel api and multi-component test updated with 
p2p --- .../implementation/sched/chan_backend.c | 3 +- .../implementation/sched/sched_info.h | 2 + .../implementation/sched/test_sched/init.c | 148 +++++++++++-- .../tests/test_schedinv/test_schedinv.c | 14 +- .../implementation/tests/unit_slrcv/init.c | 153 +++++++------ src/components/include/crt_chan.h | 203 +++++++++++++----- src/components/interface/crt/chan_crt.h | 4 - src/kernel/include/shared/cos_types.h | 4 + src/platform/i386/runscripts/test_slite02.sh | 14 +- 9 files changed, 387 insertions(+), 158 deletions(-) diff --git a/src/components/implementation/sched/chan_backend.c b/src/components/implementation/sched/chan_backend.c index ee3712a623..10736eabdf 100644 --- a/src/components/implementation/sched/chan_backend.c +++ b/src/components/implementation/sched/chan_backend.c @@ -1,13 +1,12 @@ #include #include +#include struct __sched_inout_chan { struct crt_chan *in, *out; } __sched_thds[NUM_CPU][MAX_NUM_THREADS]; -CRT_CHAN_TYPE_PROTOTYPES(LU, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); - void __sched_stdio_init(void) { diff --git a/src/components/implementation/sched/sched_info.h b/src/components/implementation/sched/sched_info.h index 7f24a2cd30..a1895d717e 100644 --- a/src/components/implementation/sched/sched_info.h +++ b/src/components/implementation/sched/sched_info.h @@ -11,8 +11,10 @@ #include #include #include +#include #define SCHED_MAX_CHILD_COMPS 8 +CRT_CHAN_TYPE_PROTOTYPES(LU, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); struct sched_childinfo { struct cos_defcompinfo defcinfo; diff --git a/src/components/implementation/sched/test_sched/init.c b/src/components/implementation/sched/test_sched/init.c index 1dfa424e70..6874ea3b09 100644 --- a/src/components/implementation/sched/test_sched/init.c +++ b/src/components/implementation/sched/test_sched/init.c @@ -13,6 +13,12 @@ #include #include +#define INITIALIZE_PRIO 1 +#define INITIALIZE_PERIOD_MS (4000) +#define INITIALIZE_BUDGET_MS (2000) + +static struct sl_thd *__initializer_thd[NUM_CPU] CACHE_ALIGNED; + u32_t cycs_per_usec = 0; cycles_t *int_start = NULL; volatile unsigned long *rdy = NULL; @@ -29,25 +35,35 @@ sched_child_init(struct sched_childinfo *schedci) assert(schedci->initthd); initthd = schedci->initthd; - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, 1)); + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, 2)); } extern void __sched_stdio_thd_init(thdid_t, struct crt_chan *, struct crt_chan *); -#define MAX_PIPE_SZ 4 +#define MAX_PIPE_SZ 8 +#define MAX_USE_PIPE_SZ 3 CRT_CHAN_STATIC_ALLOC(c0, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c1, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c3, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c4, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c5, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c6, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c7, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); #define SPDID_INT 5 #define SPDID_W1 6 #define SPDID_W3 7 -#define PRIO_INT MAX_PIPE_SZ + 4 -#define PRIO_W0 MAX_PIPE_SZ + 4 - 1 -#define PRIO_W1 MAX_PIPE_SZ + 4 - 2 -#define PRIO_W2 MAX_PIPE_SZ + 4 - 3 -#define PRIO_W3 MAX_PIPE_SZ + 4 - 4 +#define PRIO_START (MAX_PIPE_SZ + 8) + +#define PRIO_INT PRIO_START +#define PRIO_W0 (PRIO_START - 1) +#define PRIO_W1 (PRIO_START - 2) +#define PRIO_W2 (PRIO_START - 3) +#define PRIO_W3 (PRIO_START - 4) +#define PRIO_W4 (PRIO_START - 5) +#define PRIO_W5 (PRIO_START - 6) +#define PRIO_W6 (PRIO_START 
- 7) #define SND_DATA 0x1234 @@ -55,17 +71,67 @@ CRT_CHAN_STATIC_ALLOC(c3, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); #define MAX_ITERS 100 int iters = 0; cycles_t tot = 0, wc = 0; +static int pc, tc; + +struct __thd_info { + struct sl_thd *t; + tcap_prio_t p; +} iot[MAX_PIPE_SZ + 1]; + +struct __pipe_info { + struct sl_thd *sndr, *rcvr; /* p2p channels */ + struct crt_chan *c; +} iop[MAX_PIPE_SZ]; + +static int +schedinit_self(void) +{ + if (ps_load(&tc) < (MAX_USE_PIPE_SZ + 1)) return 1; + + assert(ps_load(&tc) == (MAX_USE_PIPE_SZ + 1)); + + return 0; +} + +static void +__init_done(void *d) +{ + while (schedinit_self()) sl_thd_block_periodic(0); + + int i; + + for (i = 0; i < MAX_USE_PIPE_SZ; i++) { + if (i == 0) { + crt_chan_init_LU(iop[i].c); + } else { + assert(iop[i].sndr && iop[i].rcvr); + crt_chan_p2p_init_LU(iop[i].c, iop[i].sndr, iop[i].rcvr); + } + } + + /* don't want the threads to run before channels are initialized! */ + for (i = MAX_USE_PIPE_SZ; i >= 0; i--) { + PRINTC("%d, %lx, %u\n", i, (unsigned long)(iot[i].t), sl_thd_thdid(iot[i].t)); + assert(iot[i].t); + sl_thd_param_set(iot[i].t, sched_param_pack(SCHEDP_PRIO, iot[i].p)); + } + PRINTLOG(PRINT_DEBUG, "SELF (inc. CHILD) INIT DONE.\n"); + + sl_thd_exit(); + + assert(0); +} + static void work_thd_fn(void *data) { int is_last = (int)data; - unsigned long i = 0; ps_faa(rdy, 1); while (1) { - i = chan_in(); + chan_in(); if (unlikely(is_last)) { cycles_t end, diff; rdtscll(end); @@ -76,7 +142,7 @@ work_thd_fn(void *data) iters++; if (iters == MAX_ITERS) { - printc("%llu, %llu\n", tot / iters, wc); + PRINTC("%llu, %llu\n", tot / iters, wc); tot = wc = 0; iters = 0; } @@ -93,12 +159,18 @@ sched_child_thd_create(struct sched_childinfo *schedci, thdclosure_index_t idx) struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 0, 0, 0, 0, 0, &addr, NULL); assert(t); if (cos_inv_token() == SPDID_W1) { - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W1)); + iot[2].t = t; + iot[2].p = PRIO_W1; + iop[1].rcvr = t; + iop[2].sndr = t; __sched_stdio_thd_init(sl_thd_thdid(t), c1, c2); } else if (cos_inv_token() == SPDID_W3) { - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W3)); + iot[4].t = t; + iot[4].p = PRIO_W3; + iop[3].rcvr = t; __sched_stdio_thd_init(sl_thd_thdid(t), c3, NULL); } + ps_faa(&tc, 1); return t ? sl_thd_thdid(t) : 0; } @@ -113,8 +185,11 @@ sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, if (!ps_cas(&first, 1, 0)) assert(0); struct sl_thd *t = sl_thd_aep_alloc_ext(sched_child_defci_get(schedci), NULL, idx, 1, owntc, key, ipiwin, ipimax, &addr, extrcv); assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_INT)); __sched_stdio_thd_init(sl_thd_thdid(t), NULL, c0); + iot[0].t = t; + iot[0].p = PRIO_INT; + iop[0].sndr = t; + ps_faa(&tc, 1); return t ? sl_thd_thdid(t) : 0; } @@ -122,16 +197,24 @@ sched_child_aep_create(struct sched_childinfo *schedci, thdclosure_index_t idx, void test_pipes_init(void) { - struct sl_thd *t = sl_thd_alloc(work_thd_fn, 0); + struct sl_thd *t = sl_thd_alloc(work_thd_fn, MAX_USE_PIPE_SZ == 1 ? 
(void *)1 : (void *)0); assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W0)); -// __sched_stdio_thd_init(sl_thd_thdid(t), c0, NULL); + iot[1].t = t; + iot[1].p = PRIO_W0; + iop[0].rcvr = t; /* no optimized path for rcving from INT thread */ + iop[1].sndr = t; __sched_stdio_thd_init(sl_thd_thdid(t), c0, c1); - t = sl_thd_alloc(work_thd_fn, 0); - assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, PRIO_W2)); - __sched_stdio_thd_init(sl_thd_thdid(t), c2, c3); -// __sched_stdio_thd_init(sl_thd_thdid(t), c2, NULL); + ps_faa(&tc, 1); + if (MAX_USE_PIPE_SZ >= 3) { + t = sl_thd_alloc(work_thd_fn, MAX_USE_PIPE_SZ == 3 ? (void *)1 : (void *)0); + assert(t); + iot[3].t = t; + iot[3].p = PRIO_W2; + iop[2].rcvr = t; + iop[3].sndr = t; + __sched_stdio_thd_init(sl_thd_thdid(t), c2, c3); + ps_faa(&tc, 1); + } } void @@ -143,6 +226,20 @@ cos_init(void) static u32_t cpubmp[NUM_CPU_BMP_WORDS] = { 0 }; int i; + assert(NUM_CPU == 1); + assert(MAX_USE_PIPE_SZ <= MAX_PIPE_SZ); + memset(iop, 0, sizeof(struct __pipe_info) * MAX_PIPE_SZ); + memset(iot, 0, sizeof(struct __thd_info) * (MAX_PIPE_SZ + 1)); + pc = tc = 0; + iop[0].c = c0; + iop[1].c = c1; + iop[2].c = c2; + iop[3].c = c3; + iop[4].c = c4; + iop[5].c = c5; + iop[6].c = c6; + iop[7].c = c7; + PRINTLOG(PRINT_DEBUG, "CPU cycles per sec: %u\n", cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE)); if (ps_cas((unsigned long *)&first, NUM_CPU + 1, cos_cpuid())) { @@ -163,7 +260,7 @@ cos_init(void) while (!ps_load((unsigned long *)&init_done[i])) ; } - sl_init_corebmp(500*SL_MIN_PERIOD_US, cpubmp); + sl_init_corebmp(100*SL_MIN_PERIOD_US, cpubmp); vaddr_t tscaddr = 0; cbuf_t id = channel_shared_page_alloc(SHMCHANNEL_KEY, &tscaddr); assert(id >= 0); @@ -173,7 +270,12 @@ cos_init(void) *rdy = 0; sched_childinfo_init(); test_pipes_init(); - self_init[cos_cpuid()] = 1; + __initializer_thd[cos_cpuid()] = sl_thd_alloc(__init_done, NULL); + assert(__initializer_thd[cos_cpuid()]); + sl_thd_param_set(__initializer_thd[cos_cpuid()], sched_param_pack(SCHEDP_PRIO, INITIALIZE_PRIO)); + sl_thd_param_set(__initializer_thd[cos_cpuid()], sched_param_pack(SCHEDP_WINDOW, INITIALIZE_BUDGET_MS)); + sl_thd_param_set(__initializer_thd[cos_cpuid()], sched_param_pack(SCHEDP_BUDGET, INITIALIZE_PERIOD_MS)); + hypercall_comp_init_done(); sl_sched_loop_nonblock(); diff --git a/src/components/implementation/tests/test_schedinv/test_schedinv.c b/src/components/implementation/tests/test_schedinv/test_schedinv.c index e71231edcc..6cda87a69e 100644 --- a/src/components/implementation/tests/test_schedinv/test_schedinv.c +++ b/src/components/implementation/tests/test_schedinv/test_schedinv.c @@ -19,10 +19,10 @@ static u32_t cycs_per_usec = 0; -#define MAX_PIPE_SZ 4 +#define MAX_USE_PIPE_SZ 3 #define SND_DATA 0x4321 -#define HPET_PERIOD_TEST_US 5000 +#define HPET_PERIOD_TEST_US 20000 #define SHMCHANNEL_KEY 0x2020 static cycles_t *sttsc = NULL; @@ -33,7 +33,7 @@ __test_int_fn(arcvcap_t rcv, void *data) { ps_faa(rdy, 1); - while (ps_load(rdy) <= MAX_PIPE_SZ) sched_thd_block_timeout(0, time_now() + time_usec2cyc(HPET_PERIOD_TEST_US)); + while (ps_load(rdy) <= MAX_USE_PIPE_SZ) sched_thd_block_timeout(0, time_now() + time_usec2cyc(HPET_PERIOD_TEST_US)); int a = capmgr_hw_periodic_attach(HW_HPET_PERIODIC, cos_thdid(), HPET_PERIOD_TEST_US); assert(a == 0); @@ -69,7 +69,7 @@ __test_wrk_fn(void *data) tot += diff; iters++; if (iters == ITERS) { - printc("%llu, %llu\n", tot / ITERS, wc); + PRINTC("%llu, %llu\n", tot / ITERS, wc); tot = wc = 0; iters = 0; } @@ -91,7 +91,10 @@ 
test_aeps(void) if (cos_spd_id() == SPDID_INT) { tid = sched_aep_create(&intaep, __test_int_fn, (void *)0, 0, 0, 0, 0); } else { - tid = sched_thd_create(__test_wrk_fn, cos_spd_id() == SPDID_W3 ? (void *)1: (void *)0); + tid = sched_thd_create(__test_wrk_fn, + ((cos_spd_id() == SPDID_W3 && MAX_USE_PIPE_SZ == 4) + || (cos_spd_id() == SPDID_W1 && MAX_USE_PIPE_SZ == 2)) + ? (void *)1: (void *)0); } assert(tid); } @@ -113,6 +116,7 @@ cos_init(void) assert(hypercall_comp_child_next(cos_spd_id(), &child, &childflags) == -1); test_aeps(); + PRINTC("Init Done!\n"); sched_thd_exit(); } diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c index 64fe2a3b76..ad239b4ce3 100644 --- a/src/components/implementation/tests/unit_slrcv/init.c +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -13,7 +13,7 @@ static struct sl_xcore_thd *ping; static struct sl_xcore_thd *pong; -#define HPET_PERIOD_TEST_US 2000 +#define HPET_PERIOD_TEST_US 20000 #define WORK_US (1000) @@ -35,69 +35,105 @@ ping_fn(void *d) unsigned int iter = 0; cycles_t st = 0, en = 0, tot = 0, wc = 0; CRT_CHAN_STATIC_ALLOC(c0, int, 4); +CRT_CHAN_STATIC_ALLOC(c1, int, 4); +CRT_CHAN_STATIC_ALLOC(c2, int, 4); +CRT_CHAN_STATIC_ALLOC(c3, int, 4); +CRT_CHAN_STATIC_ALLOC(c4, int, 4); +CRT_CHAN_STATIC_ALLOC(c5, int, 4); CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); +#define PIPELINE_LEN 3 +#define ITERS 100 + +static inline void +chrcv(int i) +{ + int r; + + //printc("[r%d]", i); + switch(i) { + case 0: crt_chan_recv_test(c0, &r); break; + case 1: crt_chan_recv_test(c1, &r); break; + case 2: crt_chan_recv_test(c2, &r); break; + case 3: crt_chan_recv_test(c3, &r); break; + case 4: crt_chan_recv_test(c4, &r); break; + case 5: crt_chan_recv_test(c5, &r); break; + default: assert(0); + } + //printc("[d%d]", i); +} + +static inline void +chsnd(int i) +{ + int s = 0xDEAD0000 | i; + + //printc("[s%d]", i); + switch(i) { + case 0: crt_chan_send_test(c0, &s); break; + case 1: crt_chan_send_test(c1, &s); break; + case 2: crt_chan_send_test(c2, &s); break; + case 3: crt_chan_send_test(c3, &s); break; + case 4: crt_chan_send_test(c4, &s); break; + case 5: crt_chan_send_test(c5, &s); break; + default: assert(0); + } + //printc("[o%d]", i); +} + +static inline void +chinit(int i) +{ + switch(i) { + case 0: crt_chan_init_test(c0); break; + case 1: crt_chan_init_test(c1); break; + case 2: crt_chan_init_test(c2); break; + case 3: crt_chan_init_test(c3); break; + case 4: crt_chan_init_test(c4); break; + case 5: crt_chan_init_test(c5); break; + default: assert(0); + } +} + static inline void work_fn(void *x) { - int rcv; + int chid = (int)x; while (1) { - // printc("a"); - //sl_thd_block(0); - crt_chan_recv_test(c0, &rcv); - // printc("b"); - rdtscll(en); - if (unlikely(!st)) continue; - assert(en > st); - cycles_t diff = en - st; - if (diff > wc) wc = diff; - tot += diff; - iter ++; - if (unlikely(iter == 1000)) { - PRINTC("%llu %llu\n", tot / iter, wc); - iter = 0; - wc = tot = 0; + chrcv(chid); + + if (likely(chid + 1 < PIPELINE_LEN)) chsnd(chid + 1); + else { + rdtscll(en); + assert(en > st); + cycles_t diff = en - st; + if (diff > wc) wc = diff; + tot += diff; + iter ++; + if (unlikely(iter == ITERS)) { + PRINTC("%llu %llu\n", tot / iter, wc); + //iter = 0; + //wc = tot = 0; + } } } + sl_thd_exit(); } -struct sl_thd *wt = NULL; -thdid_t wtid = 0; +struct sl_thd *wt[PIPELINE_LEN] = { NULL }; static inline void pong_fn(arcvcap_t r, void *d) { - //printc("#"); + PRINTC("Hpet Register\n"); int a 
= capmgr_hw_periodic_attach(HW_HPET_PERIODIC, cos_thdid(), HPET_PERIOD_TEST_US); assert(a == 0); - int snd = 0x1234; - //printc("!"); - int i = 0; while (1) { - // printc("c"); + if (iter == ITERS) capmgr_hw_detach(HW_HPET_PERIODIC); int p = sl_thd_rcv(RCV_ULONLY); rdtscll(st); - // printc("d"); - //int p = cos_rcv(r, 0); - - //printc("[%d] ", i++); - //sl_thd_wakeup(wtid); - crt_chan_send_test(c0, &snd); -// printc("e"); -// rdtscll(en); -// //if (unlikely(!st)) continue; -// assert(en > st); -// cycles_t diff = en - st; -// if (diff > wc) wc = diff; -// tot += diff; -// iter ++; -// if (unlikely(iter == 1000)) { -// PRINTC("%llu %llu\n", tot / iter, wc); -// iter = 0; -// wc = tot = 0; -// } - + chsnd(0); } sl_thd_exit(); } @@ -145,28 +181,19 @@ cos_init(void *d) assert(NUM_CPU == 1); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init(); - //cos_dcb_info_init_curr(); sl_init(SL_MIN_PERIOD_US*100); - - crt_chan_init_test(c0); - - wt = sl_thd_alloc(work_fn, NULL); - assert(wt); - wtid = sl_thd_thdid(wt); + int i; struct sl_thd *rt = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); assert(rt); - sl_thd_param_set(wt, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); - sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); -// r = sl_thd_rcvcap(rt); -// assert(r); - //struct sl_thd *st = sl_thd_alloc(ping_fn, (void *)&s); - //assert(st); - //sl_thd_param_set(st, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); - - //s = cos_asnd_alloc(ci, r, ci->captbl_cap); - //assert(s); -// s = capmgr_asnd_rcv_create(r); -// assert(s); + + for (i = 0; i < PIPELINE_LEN; i++) { + chinit(i); + wt[i] = sl_thd_alloc(work_fn, (void *)i); + assert(wt[i]); + sl_thd_param_set(wt[i], sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1+PIPELINE_LEN-i)); + } + + sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1+PIPELINE_LEN+1)); } ps_faa(&init_done[cos_cpuid()], 1); diff --git a/src/components/include/crt_chan.h b/src/components/include/crt_chan.h index ea833db694..5f4267bb8a 100644 --- a/src/components/include/crt_chan.h +++ b/src/components/include/crt_chan.h @@ -14,6 +14,8 @@ #include #include #include +#include +#include struct crt_chan { u32_t producer; @@ -32,6 +34,8 @@ struct crt_chan { */ u32_t item_sz, wraparound_mask; u32_t nslots; + /* FIXME: p2p channels only SINGLE-CORE for now! */ + unsigned long sender, receiver; /* for p2p channels, sl_thd pointers + MSB for blocked on channel send/recv.. */ /* The memory for the channel. 
*/ char mem[0]; }; @@ -44,44 +48,47 @@ struct __crt_chan_envelope_##name { \ } __##name; \ struct crt_chan *name = &__##name.c -#define CRT_CHAN_TYPE_PROTOTYPES(name, type, nslots) \ -static inline int \ -crt_chan_init_##name(struct crt_chan *c) \ -{ return crt_chan_init(c, sizeof(type), nslots); } \ -static inline void \ -crt_chan_teardown_##name(struct crt_chan *c) \ -{ crt_chan_teardown(c); } \ -static inline int \ -crt_chan_empty_##name(struct crt_chan *c) \ -{ return __crt_chan_empty(c, nslots - 1); } \ -static inline int \ -crt_chan_full_##name(struct crt_chan *c) \ -{ return __crt_chan_full(c, nslots - 1); } \ -static inline int \ -crt_chan_send_##name(struct crt_chan *c, void *item) \ -{ \ - assert(pow2(nslots)); \ - return __crt_chan_send(c, item, nslots - 1, sizeof(type)); \ -} \ -static inline int \ -crt_chan_recv_##name(struct crt_chan *c, void *item) \ -{ \ - assert(pow2(nslots)); \ - return __crt_chan_recv(c, item, nslots - 1, sizeof(type)); \ -} \ -static inline int \ -crt_chan_async_send_##name(struct crt_chan *c, void *item) \ -{ \ - assert(pow2(nslots)); \ - if (__crt_chan_produce(c, item, nslots - 1, sizeof(type))) return -EAGAIN; \ - return 0; \ -} \ -static inline int \ -crt_chan_async_recv_##name(struct crt_chan *c, void *item) \ -{ \ - assert(pow2(nslots)); \ - if (__crt_chan_consume(c, item, nslots - 1, sizeof(type))) return -EAGAIN; \ - return 0; \ +#define CRT_CHAN_TYPE_PROTOTYPES(name, type, nslots) \ +static inline int \ +crt_chan_init_##name(struct crt_chan *c) \ +{ return crt_chan_init(c, sizeof(type), nslots); } \ +static inline int \ +crt_chan_p2p_init_##name(struct crt_chan *c, struct sl_thd *sndr, struct sl_thd *rcvr) \ +{ return crt_chan_p2p_init(c, sizeof(type), nslots, sndr, rcvr); } \ +static inline void \ +crt_chan_teardown_##name(struct crt_chan *c) \ +{ crt_chan_teardown(c); } \ +static inline int \ +crt_chan_empty_##name(struct crt_chan *c) \ +{ return __crt_chan_empty(c, nslots - 1); } \ +static inline int \ +crt_chan_full_##name(struct crt_chan *c) \ +{ return __crt_chan_full(c, nslots - 1); } \ +static inline int \ +crt_chan_send_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + return __crt_chan_send(c, item, nslots - 1, sizeof(type)); \ +} \ +static inline int \ +crt_chan_recv_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + return __crt_chan_recv(c, item, nslots - 1, sizeof(type)); \ +} \ +static inline int \ +crt_chan_async_send_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + if (__crt_chan_produce(c, item, nslots - 1, sizeof(type))) return -EAGAIN; \ + return 0; \ +} \ +static inline int \ +crt_chan_async_recv_##name(struct crt_chan *c, void *item) \ +{ \ + assert(pow2(nslots)); \ + if (__crt_chan_consume(c, item, nslots - 1, sizeof(type))) return -EAGAIN; \ + return 0; \ } #define CRT_CHANCHAN_PROTOTYPES(nslots) \ @@ -94,7 +101,6 @@ __crt_chan_buff_idx(struct crt_chan *c, u32_t v, u32_t wraparound_mask) static inline int __crt_chan_full(struct crt_chan *c, u32_t wraparound_mask) { return __crt_chan_buff_idx(c, c->consumer, wraparound_mask) == __crt_chan_buff_idx(c, c->producer + 1, wraparound_mask); } -//{ return c->consumer == __crt_chan_buff_idx(c, c->producer + 1, wraparound_mask); } static inline int __crt_chan_empty(struct crt_chan *c, u32_t wraparound_mask) @@ -122,6 +128,49 @@ __crt_chan_consume(struct crt_chan *c, void *d, u32_t wraparound_mask, u32_t sz) return 0; } +/* only wake it up if it's blocked on the channel! 
*/ +static inline void +__crt_chan_p2p_wakeup(unsigned long *w) +{ + unsigned long wc, wn; + + sl_cs_enter(); + wc = ps_load(w); + if (likely(wc & (1<<31))) goto blocked; + sl_cs_exit(); + + return; + +blocked: + wn = wc & ~(1<<31); + struct sl_thd *wt = (struct sl_thd *)wn; + if (unlikely(!ps_upcas(w, wc, wn))) BUG(); + sl_thd_wakeup_no_cs(wt); + sl_cs_exit_switchto(wt); +} + +/* block on channel */ +static inline void +__crt_chan_p2p_block(unsigned long *b) +{ + unsigned long bc, bn; + + sl_cs_enter(); + bc = ps_load(b); + assert((bc & (1<<31)) == 0); + bn = bc | (1<<31); + if (unlikely(!ps_upcas(b, bc, bn))) BUG(); + + if (sl_thd_block_no_cs(sl_thd_curr(), SL_THD_BLOCKED, 0)) BUG(); + sl_cs_exit_schedule(); +} + +static inline int +__crt_chan_is_p2p(struct crt_chan *c) +{ + return ((c->sender & ~(1<<31)) && (c->receiver & ~(1<<31))); +} + /** * The next two functions pass all of the variables in via arguments, * so that we can use them for constant propagation along with @@ -130,16 +179,27 @@ __crt_chan_consume(struct crt_chan *c, void *d, u32_t wraparound_mask, u32_t sz) static inline int __crt_chan_send(struct crt_chan *c, void *item, u32_t wraparound_mask, u32_t item_sz) { - while (1) { - struct crt_blkpt_checkpoint chkpt; - - crt_blkpt_checkpoint(&c->full, &chkpt); - if (!__crt_chan_produce(c, item, wraparound_mask, item_sz)) { - /* success! */ - crt_blkpt_trigger(&c->empty, 0); - break; + /* optimizing for p2p */ + if (likely(__crt_chan_is_p2p(c))) { + while (1) { + if (!__crt_chan_produce(c, item, wraparound_mask, item_sz)) { + __crt_chan_p2p_wakeup(&c->receiver); + break; + } + __crt_chan_p2p_block(&c->sender); + } + } else { + while (1) { + struct crt_blkpt_checkpoint chkpt; + + crt_blkpt_checkpoint(&c->full, &chkpt); + if (!__crt_chan_produce(c, item, wraparound_mask, item_sz)) { + /* success! */ + crt_blkpt_trigger(&c->empty, 0); + break; + } + crt_blkpt_wait(&c->full, 0, &chkpt); } - crt_blkpt_wait(&c->full, 0, &chkpt); } return 0; @@ -148,16 +208,27 @@ __crt_chan_send(struct crt_chan *c, void *item, u32_t wraparound_mask, u32_t ite static inline int __crt_chan_recv(struct crt_chan *c, void *item, u32_t wraparound_mask, u32_t item_sz) { - while (1) { - struct crt_blkpt_checkpoint chkpt; - - crt_blkpt_checkpoint(&c->empty, &chkpt); - if (!__crt_chan_consume(c, item, wraparound_mask, item_sz)) { - /* success! */ - crt_blkpt_trigger(&c->full, 0); - break; + /* optimizing for p2p */ + if (likely(__crt_chan_is_p2p(c))) { + while (1) { + if (!__crt_chan_consume(c, item, wraparound_mask, item_sz)) { + __crt_chan_p2p_wakeup(&c->sender); + break; + } + __crt_chan_p2p_block(&c->receiver); + } + } else { + while (1) { + struct crt_blkpt_checkpoint chkpt; + + crt_blkpt_checkpoint(&c->empty, &chkpt); + if (!__crt_chan_consume(c, item, wraparound_mask, item_sz)) { + /* success! */ + crt_blkpt_trigger(&c->full, 0); + break; + } + crt_blkpt_wait(&c->empty, 0, &chkpt); } - crt_blkpt_wait(&c->empty, 0, &chkpt); } return 0; @@ -192,6 +263,24 @@ crt_chan_init(struct crt_chan *c, int item_sz, int slots) c->nslots = slots; c->item_sz = item_sz; c->wraparound_mask = slots - 1; /* slots is a pow2 */ + c->sender = c->receiver = 0; + + return 0; +} + +static inline int +crt_chan_p2p_init(struct crt_chan *c, int item_sz, int slots, + struct sl_thd *sndr, struct sl_thd *rcvr) +{ + int r = crt_chan_init(c, item_sz, slots); + assert(sndr && rcvr); + + /* FIXME: only single-core for now! 
*/ + if (r > 0) return r; + c->sender = (unsigned long)sndr; + c->receiver = (unsigned long)rcvr; + assert((c->sender & (1<<31)) == 0); + assert((c->receiver & (1<<31)) == 0); return 0; } diff --git a/src/components/interface/crt/chan_crt.h b/src/components/interface/crt/chan_crt.h index 16386def5b..2d93167c45 100644 --- a/src/components/interface/crt/chan_crt.h +++ b/src/components/interface/crt/chan_crt.h @@ -1,10 +1,6 @@ #ifndef CHAN_CRT_H #define CHAN_CRT_H -#define CHAN_CRT_NSLOTS 4 -#define CHAN_CRT_ITEM_TYPE unsigned long -#define CHAN_CRT_ITEM_SZ sizeof(CHAN_CRT_ITEM_TYPE) - int chan_out(unsigned long item); unsigned long chan_in(void); diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index e67708b7bf..2022db0368 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -521,6 +521,10 @@ typedef unsigned int isolation_level_t; #define MEMMGR_MAX_SHMEM_REGIONS 1024 #define CAPMGR_AEPKEYS_MAX (1<<15) +#define CHAN_CRT_NSLOTS 4 +#define CHAN_CRT_ITEM_TYPE unsigned long +#define CHAN_CRT_ITEM_SZ sizeof(CHAN_CRT_ITEM_TYPE) + #define IPIWIN_DEFAULT_US (1000) /* 1ms */ #define IPIMAX_DEFAULT (64) /* IPIs per ms for each RCV ep */ diff --git a/src/platform/i386/runscripts/test_slite02.sh b/src/platform/i386/runscripts/test_slite02.sh index 0fb9290e71..78f7b5127c 100644 --- a/src/platform/i386/runscripts/test_slite02.sh +++ b/src/platform/i386/runscripts/test_slite02.sh @@ -4,12 +4,18 @@ cp llboot_comp.o llboot.o cp test_sched.o boot.o cp test_sched_inv.o intcomp.o cp test_sched_inv.o w1comp.o +cp test_sched_inv.o w3comp.o cp test_boot.o dummy1.o cp test_boot.o dummy2.o -#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o" ./gen_client_stub -#./cos_linker "llboot.o, ;intcomp.o, ;capmgr.o, ;w1comp.o, ;*boot.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o" ./gen_client_stub -cp test_sched_inv.o w3comp.o -./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" ./gen_client_stub + +# only int and w0 in root +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub + +#int, w0 in root and w1 in comp +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o" ./gen_client_stub + +# int, w1 - w3 +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" ./gen_client_stub #cp test_boot.o dummy.o #./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub From e0927112f5bd5a719762b0108a914b82f6314add Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 2 Oct 2019 14:48:16 -0400 Subject: [PATCH 116/127] using p2p in hierarchical test --- .../implementation/tests/unit_slrcv/init.c | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c index ad239b4ce3..63dc56e2e7 100644 --- 
a/src/components/implementation/tests/unit_slrcv/init.c +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -34,12 +34,12 @@ ping_fn(void *d) unsigned int iter = 0; cycles_t st = 0, en = 0, tot = 0, wc = 0; -CRT_CHAN_STATIC_ALLOC(c0, int, 4); -CRT_CHAN_STATIC_ALLOC(c1, int, 4); -CRT_CHAN_STATIC_ALLOC(c2, int, 4); -CRT_CHAN_STATIC_ALLOC(c3, int, 4); -CRT_CHAN_STATIC_ALLOC(c4, int, 4); -CRT_CHAN_STATIC_ALLOC(c5, int, 4); +CRT_CHAN_STATIC_ALLOC(c0, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c1, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c3, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c4, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); +CRT_CHAN_STATIC_ALLOC(c5, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); #define PIPELINE_LEN 3 @@ -82,15 +82,15 @@ chsnd(int i) } static inline void -chinit(int i) +chinit(int i, struct sl_thd *s, struct sl_thd *r) { switch(i) { case 0: crt_chan_init_test(c0); break; - case 1: crt_chan_init_test(c1); break; - case 2: crt_chan_init_test(c2); break; - case 3: crt_chan_init_test(c3); break; - case 4: crt_chan_init_test(c4); break; - case 5: crt_chan_init_test(c5); break; + case 1: crt_chan_p2p_init_test(c1, s, r); break; + case 2: crt_chan_p2p_init_test(c2, s, r); break; + case 3: crt_chan_p2p_init_test(c3, s, r); break; + case 4: crt_chan_p2p_init_test(c4, s, r); break; + case 5: crt_chan_p2p_init_test(c5, s, r); break; default: assert(0); } } @@ -187,10 +187,11 @@ cos_init(void *d) assert(rt); for (i = 0; i < PIPELINE_LEN; i++) { - chinit(i); wt[i] = sl_thd_alloc(work_fn, (void *)i); assert(wt[i]); sl_thd_param_set(wt[i], sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1+PIPELINE_LEN-i)); + if (i == 0) chinit(i, 0, 0); + else chinit(i, wt[i-1], wt[i]); } sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1+PIPELINE_LEN+1)); From 4be00fdeeb155ad93cc8edd47fa5f2da61b4832b Mon Sep 17 00:00:00 2001 From: phani Date: Wed, 2 Oct 2019 18:06:34 -0400 Subject: [PATCH 117/127] switch to scheduler if there are more events pending in the kernel --- src/components/include/cos_ulsched_rcv.h | 9 +++++++++ src/components/include/sl.h | 11 ++++++++++- src/kernel/include/shared/cos_sched.h | 2 +- src/kernel/include/thd.h | 25 +++++++++++++++++------- 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/components/include/cos_ulsched_rcv.h b/src/components/include/cos_ulsched_rcv.h index 881d0da7f6..23becab4dd 100644 --- a/src/components/include/cos_ulsched_rcv.h +++ b/src/components/include/cos_ulsched_rcv.h @@ -9,6 +9,15 @@ __cos_sched_events_present(struct cos_sched_ring *r) return !(r->tail == r->head); } +static inline int +cos_sched_ispending(void) +{ + struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); + struct cos_sched_ring *r = &scb_cpu->sched_events; + + return r->more; +} + static inline int __cos_sched_event_consume(struct cos_sched_ring *r, struct cos_sched_event *e) { diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 058293d33d..fb3fdb0ab3 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -39,6 +39,7 @@ #include #include #include +#include #define SL_CS #undef SL_REPLENISH @@ -547,7 +548,15 @@ sl_thd_activate_c(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout, tcap_p { if (unlikely(t->properties & SL_THD_PROPERTY_SEND)) { return cos_sched_asnd(t->sndcap, g->timeout_next, g->sched_rcv, tok); - } else if (unlikely(t->properties & 
SL_THD_PROPERTY_OWN_TCAP)) { + } + + /* there is more events.. run scheduler again! */ + if (unlikely(cos_sched_ispending())) { + if (curr == g->sched_thd) return -EBUSY; + return sl_thd_dispatch_usr(g->sched_thd, tok, curr); + } + + if (unlikely(t->properties & SL_THD_PROPERTY_OWN_TCAP)) { return sl_thd_dispatch_kern(t, tok, curr, timeout, sl_thd_tcap(t), prio == 0 ? t->prio : prio); } diff --git a/src/kernel/include/shared/cos_sched.h b/src/kernel/include/shared/cos_sched.h index bf6b7ef6d1..525d7edcb9 100644 --- a/src/kernel/include/shared/cos_sched.h +++ b/src/kernel/include/shared/cos_sched.h @@ -18,7 +18,7 @@ struct cos_sched_event { #define COS_SCHED_EVENT_RING_SIZE 16 struct cos_sched_ring { - int head, tail; + int head, tail, more; struct cos_sched_event event_buf[COS_SCHED_EVENT_RING_SIZE]; } __attribute__((packed)); diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 863301702c..e10f2b1cdd 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -202,10 +202,27 @@ thd_rcvcap_init(struct thread *t, int is_init) rc->rcvcap_thd_notif = NULL; } +static inline struct comp_info * +thd_invstk_peek_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info, int peek_index) +{ + /* curr_thd should be the current thread! We are using cached invstk_top. */ + return &(curr_thd->invstk[peek_index].comp_info); +} + static inline void thd_rcvcap_evt_enqueue(struct thread *head, struct thread *t) { + struct cos_cpu_local_info *cos_info = cos_cpu_local_info(); + struct comp_info *c = thd_invstk_peek_compinfo(head, cos_info, 0); /* in its root component! */ + struct cos_scb_info *scb = NULL; + struct cos_sched_ring *r = NULL; + if (list_empty(&t->event_list) && head != t) list_enqueue(&head->event_head, &t->event_list); + if (unlikely(!c ||!c->scb_data)) return; + + scb = ((c->scb_data) + get_cpuid()); + r = &(scb->sched_events); + r->more = !list_isempty(&head->event_head); } static inline void @@ -527,13 +544,6 @@ curr_invstk_top(struct cos_cpu_local_info *cos_info) return cos_info->invstk_top; } -static inline struct comp_info * -thd_invstk_peek_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info, int peek_index) -{ - /* curr_thd should be the current thread! We are using cached invstk_top. */ - return &(curr_thd->invstk[peek_index].comp_info); -} - static inline struct comp_info * thd_invstk_current_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos_info) { @@ -634,6 +644,7 @@ thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info } r->tail += delta; + r->more = !list_isempty(&thd->event_head); return delta; } From 291e21bdde5187f4b980c0b2e8fe956c968c3d85 Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 3 Oct 2019 10:37:25 -0400 Subject: [PATCH 118/127] pending for user-level rcv --- src/components/include/sl.h | 2 +- src/kernel/capinv.c | 6 ++++-- src/kernel/include/thd.h | 10 ++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index fb3fdb0ab3..37d8a5e8df 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -1004,7 +1004,7 @@ sl_thd_rcv(rcv_flags_t flags) // //q = ps_load(p); // } // assert(sl_thd_dcbinfo(t)->sp == 0); -// assert(q == 1); +// assert(q == 1); /* q should be 1 if the thread did not call COS_RCV and is woken up.. 
*/ // //done: // ps_upcas(p, q, 0); diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 91b052881b..2d2948fd7e 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -991,13 +991,15 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str if (thd_rcvcap_pending(thd)) { __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); thd_rcvcap_pending_deliver(thd, regs); - if (thd->dcbinfo) thd->dcbinfo->sp = 0; + /* for sched_rcv enabling user-level switch */ + //if (thd->dcbinfo) thd->dcbinfo->sp = 0; return 0; } else if (rflags & RCV_NON_BLOCKING) { __userregs_set(regs, 0, __userregs_getsp(regs), __userregs_getip(regs)); __userregs_setretvals(regs, -EAGAIN, 0, 0, 0); - if (thd->dcbinfo) thd->dcbinfo->sp = 0; + /* for sched_rcv enabling user-level switch */ + //if (thd->dcbinfo) thd->dcbinfo->sp = 0; return 0; } diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index e10f2b1cdd..55dde752dc 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -251,7 +251,7 @@ thd_track_exec(struct thread *t) static int thd_rcvcap_pending(struct thread *t) { - if (t->rcvcap.pending) return t->rcvcap.pending; + if (t->rcvcap.pending || (t->dcbinfo && t->dcbinfo->pending)) return 1; return !list_isempty(&t->event_head); } @@ -270,17 +270,15 @@ thd_rcvcap_set_counter(struct thread *t, sched_tok_t cntr) static void thd_rcvcap_pending_set(struct thread *arcvt) { - if (likely(arcvt->dcbinfo)) { - arcvt->dcbinfo->pending = 1; - //printk("%u:%d\n", arcvt->tid, arcvt->dcbinfo->pending); - } + if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 1; else arcvt->rcvcap.pending = 1; } static void thd_rcvcap_pending_reset(struct thread *arcvt) { - arcvt->rcvcap.pending = 0; + if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 0; + else arcvt->rcvcap.pending = 0; } static inline int From a2d2286ebfa5de8e52cde9846c5bd192ddebcefd Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 3 Oct 2019 14:29:31 -0400 Subject: [PATCH 119/127] Timers disabled coz of cos_thd_switch in ul switch * pass the timeout with the api to call slowpath kernel switch * make sure cos_rcv is called with the sched timer in cos_ul_rcv. * if cos_rcv is switching back to the scheduler, do not disable the timer. * there seem to be some cases, where switching to a tcap with infinite budget is disabling timer somewhere. It happens when tcap has inf budget and timeout is set to NIL. Timeout should be set appropriately! This will disable the timer programmed previously. 
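
(Illustrative note, not part of the original commit: the bullets above can be summarized by a small sketch. It assumes the sl.h interfaces that appear in the diffs below -- sl__globals_core(), cos_defswitch(), sl_thd_thdcap(), and the timeout_next field -- and the helper name switch_with_timer_sketch() is made up purely for illustration, not code from this patch.)

/*
 * Hypothetical sketch of the rule described above: when leaving the scheduler
 * through the slowpath kernel switch, always carry the scheduler's next timeout
 * instead of TCAP_TIME_NIL, so that dispatching on a tcap with infinite budget
 * cannot silently clear the timer that was already programmed.
 */
static inline int
switch_with_timer_sketch(struct sl_thd *next, sched_tok_t tok)
{
	struct sl_global_core *g = sl__globals_core();
	/* assumed: g->timeout_next holds the scheduler's next programmed timeout */
	tcap_time_t timeout = g->timeout_next;

	/* slowpath switch with the timeout passed through, mirroring sl_thd_dispatch_usr below */
	return cos_defswitch(sl_thd_thdcap(next), next->prio, timeout, tok);
}
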
--- src/components/include/cos_ulsched_rcv.h | 15 ++++++++- src/components/include/sl.h | 39 +++++++++++++++--------- src/components/lib/sl/sl_sched.c | 3 +- src/kernel/capinv.c | 33 +++++++------------- src/kernel/include/dcb.h | 4 +-- src/kernel/include/hw.h | 10 +++--- src/kernel/include/inv.h | 26 ++++++++-------- src/kernel/include/scb.h | 4 +-- src/kernel/include/thd.h | 32 +++++++++++-------- 9 files changed, 91 insertions(+), 75 deletions(-) diff --git a/src/components/include/cos_ulsched_rcv.h b/src/components/include/cos_ulsched_rcv.h index 23becab4dd..0fa235a7df 100644 --- a/src/components/include/cos_ulsched_rcv.h +++ b/src/components/include/cos_ulsched_rcv.h @@ -48,7 +48,20 @@ cos_ul_sched_rcv(arcvcap_t rcv, rcv_flags_t rfl, tcap_time_t timeout, struct cos if (unlikely(ret < 0)) return ret; } - return (ret || __cos_sched_events_present(r)); + return (ret || __cos_sched_events_present(r) || cos_sched_ispending()); +} + +static inline int +cos_ul_rcv(arcvcap_t rcv, rcv_flags_t rfl, tcap_time_t sched_timeout) +{ + struct cos_sched_event ev = { .tid = 0 }; + int ret = 0; + + ret = cos_sched_rcv(rcv, rfl, sched_timeout, &(ev.tid), (int *)&(ev.evt.blocked), + (cycles_t *)&(ev.evt.elapsed_cycs), (tcap_time_t *)&(ev.evt.next_timeout)); + assert(ev.tid == 0); + + return ret; } #endif /* COS_ULSCHED_RCV_H */ diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 37d8a5e8df..ac10bb28c2 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -482,9 +482,10 @@ sl_thd_dispatch_usr(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) { volatile struct cos_scb_info *scb = sl_scb_info_core(); struct cos_dcb_info *cd = sl_thd_dcbinfo(curr), *nd = sl_thd_dcbinfo(next); + struct sl_global_core *g = sl__globals_core(); assert(curr != next); - if (unlikely(!cd || !nd)) return cos_defswitch(sl_thd_thdcap(next), next->prio, sl__globals_core()->timeout_next, tok); + if (unlikely(!cd || !nd)) return cos_defswitch(sl_thd_thdcap(next), next->prio, g->timeout_next, tok); /* * jump labels in the asm routine: @@ -501,6 +502,9 @@ sl_thd_dispatch_usr(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) * of this routine or kernel at some point had to switch to a * thread that co-operatively switched away from this routine. * NOTE: kernel takes care of resetting dcb sp in this case! + * + * a simple cos_thd_switch() kind will disable timers! so, pass in the timeout anyway to + * slowpath thread switch! */ __asm__ __volatile__ ( \ @@ -520,9 +524,9 @@ sl_thd_dispatch_usr(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "inc %%eax\n\t" \ "shl $16, %%eax\n\t" \ "movl $0, %%ebx\n\t" \ + "movl %%esi, %%edx\n\t" \ "movl $0, %%esi\n\t" \ "movl $0, %%edi\n\t" \ - "movl $0, %%edx\n\t" \ "sysenter\n\t" \ "jmp 3f\n\t" \ ".align 4\n\t" \ @@ -533,7 +537,7 @@ sl_thd_dispatch_usr(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) "popl %%ebp\n\t" \ : : "a" (cd), "b" (nd), - "S" ((u32_t)((u64_t)tok >> 32)), "D" ((u32_t)(((u64_t)tok << 32) >> 32)), + "S" (g->timeout_next), "D" (tok), "c" (&(scb->curr_thd)), "d" (sl_thd_thdcap(next)) : "memory", "cc"); @@ -560,9 +564,14 @@ sl_thd_activate_c(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout, tcap_p return sl_thd_dispatch_kern(t, tok, curr, timeout, sl_thd_tcap(t), prio == 0 ? t->prio : prio); } - if (unlikely(timeout || prio || t == g->idle_thd)) { + /* TODO: there is something in the kernel that seem to disable timers..!! */ + /* WORKAROUND: idle thread is a big cpu hogger.. 
so make sure there is timeout set around switching to and away! */ + if (unlikely(curr == g->idle_thd || t == g->idle_thd)) return sl_thd_dispatch_kern(t, tok, curr, g->timeout_next, g->sched_tcap, prio); + + if (unlikely(timeout || prio)) { return sl_thd_dispatch_kern(t, tok, curr, timeout, g->sched_tcap, prio); } else { + assert(t != g->idle_thd); return sl_thd_dispatch_usr(t, tok, curr); } } @@ -652,7 +661,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) struct sl_thd_policy *pt = sl_mod_schedule(); if (unlikely(!pt)) - t = globals->sched_thd; + t = globals->idle_thd; else t = sl_mod_thd_get(pt); } @@ -665,9 +674,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) #ifdef SL_CS sl_cs_exit(); #endif - if (unlikely(t == c)) { - return 0; - } + if (unlikely(t == c)) return 0; ret = sl_thd_activate_c(t, tok, 0, 0, c, globals); @@ -737,7 +744,7 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) now = sl_now(); offset = (s64_t)(globals->timer_next - now); - if (globals->timer_next && offset <= 0) sl_timeout_expended(now, globals->timer_next); + if (offset <= 0) sl_timeout_expended(now, globals->timer_next); sl_timeout_wakeup_expired(now); /* @@ -755,7 +762,7 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) struct sl_thd_policy *pt = sl_mod_schedule(); if (unlikely(!pt)) - t = globals->sched_thd; + t = globals->idle_thd; else t = sl_mod_thd_get(pt); } @@ -765,15 +772,17 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) #endif assert(t && sl_thd_is_runnable(t)); - if (unlikely(!(offset > globals->cyc_per_usec && globals->timer_prev - && abs_timeout > globals->timer_next))) { - timeout = abs_timeout < globals->timer_next - ? tcap_cyc2time(abs_timeout) : globals->timeout_next; + if (offset <= 0 || + (abs_timeout > now && abs_timeout > globals->timer_next + globals->cyc_per_usec)) { + timeout = offset <= 0 ? globals->timer_next : (abs_timeout > now ? tcap_cyc2time(abs_timeout) : 0); } #ifdef SL_CS sl_cs_exit(); #endif + if (likely(c == t && t == globals->sched_thd && timeout)) { + return cos_defswitch(globals->sched_thdcap, globals->sched_thd->prio, timeout, tok); + } if (unlikely(t == c)) return 0; /* @@ -976,7 +985,7 @@ sl_thd_event_dequeue(struct sl_thd *t, struct cos_thd_event *e) static inline int sl_thd_rcv(rcv_flags_t flags) { - return cos_rcv(sl_thd_rcvcap(sl_thd_curr()), flags); + return cos_ul_rcv(sl_thd_rcvcap(sl_thd_curr()), flags, sl__globals_core()->timeout_next); // /* FIXME: elapsed_cycs accounting..?? 
*/ // struct cos_thd_event ev = { .blocked = 1, .next_timeout = 0, .epoch = 0, .elapsed_cycs = 0 }; // struct sl_thd *t = sl_thd_curr(); diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 06735f92d4..44ed18bc98 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -553,7 +553,6 @@ sl_timeout_period(microsec_t period) cycles_t p = sl_usec2cyc(period); sl__globals_core()->period = p; - sl_timeout_relative(p); } /* engage space heater mode */ @@ -816,7 +815,7 @@ sl_sched_loop_intern(int non_block) if (sl_cs_enter_sched()) continue; /* If switch returns an inconsistency, we retry anyway */ - sl_cs_exit_schedule_nospin_timeout(g->timer_next); + sl_cs_exit_schedule_nospin_timeout(0); } } diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 2d2948fd7e..dffc472ae9 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -20,8 +20,6 @@ #define COS_DEFAULT_RET_CAP 0 -static int hw_asnd_call = 0; - /* * TODO: switch to a dedicated TLB flush thread (in a separate * protection domain) to do this. @@ -485,9 +483,6 @@ cap_thd_switch(struct pt_regs *regs, struct thread *curr, struct thread *next, s struct comp_info * next_ci = &(next->invstk[next->invstk_top].comp_info); int preempt = 0; - if (hw_asnd_call) { - //printk("[%d %d]\n", curr->tid, next->tid); - } assert(next_ci && curr && next); assert(curr->cpuid == get_cpuid() && next->cpuid == get_cpuid()); if (unlikely(curr == next)) return thd_switch_update(curr, regs, 1); @@ -701,13 +696,7 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st struct tcap *tcap = tcap_current(cos_info); int ret; - //printk("\n\n%u:%u %lu %lu %llu %lu %lu\n", thd->tid, next->tid, arcv, tc, prio, usr_counter, timeout); if (thd_cap->cpuid != get_cpuid() || thd_cap->cpuid != next->cpuid) return -EINVAL; - if (unlikely(thd->dcbinfo && thd->dcbinfo->sp)) { - //printk("\n%u: %u %lx %lx %lx\n", thd->tid, next->tid, regs->cx, thd->dcbinfo->ip, thd->dcbinfo->ip + DCB_IP_KERN_OFF); -// assert((unsigned long)regs->cx == thd->dcbinfo->ip + DCB_IP_KERN_OFF); -// assert((unsigned long)regs->bp == thd->dcbinfo->sp); - } if (arcv) { struct cap_arcv *arcv_cap; @@ -720,7 +709,8 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st ret = cap_sched_tok_validate(rcvt, usr_counter, ci, cos_info); if (ret) return ret; - if (thd_rcvcap_pending(rcvt)) { + /* only if it has scheduler events to process! 
*/ + if (thd_rcvcap_evt_pending(rcvt)) { if (thd == rcvt) return -EBUSY; next = rcvt; @@ -747,7 +737,6 @@ cap_thd_op(struct cap_thd *thd_cap, struct thread *thd, struct pt_regs *regs, st ret = cap_switch(regs, thd, next, tcap, timeout, ci, cos_info); if (tc && tcap_current(cos_info) == tcap && prio) tcap_setprio(tcap, prio); - //printk("\n\n%u:%u-%d\n", thd->tid, next->tid,ret); return ret; } @@ -862,7 +851,8 @@ cap_asnd_op(struct cap_asnd *asnd, struct thread *thd, struct pt_regs *regs, str ret = cap_sched_tok_validate(rcvt, usr_tok, ci, cos_info); if (ret) return ret; - if (thd_rcvcap_pending(rcvt)) { + /* only if the rcvt has scheduler events to process */ + if (thd_rcvcap_evt_pending(rcvt)) { if (thd == rcvt) return -EBUSY; next = rcvt; @@ -916,19 +906,15 @@ cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs) rcv_thd = arcv->thd; rcv_tcap = rcv_thd->rcvcap.rcvcap_tcap; assert(rcv_tcap && tcap); - hw_asnd_call = 1; next = asnd_process(rcv_thd, thd, rcv_tcap, tcap, &tcap_next, 0, cos_info); assert(next == rcv_thd); - if (next == thd) { - hw_asnd_call = 0; - return 1; - } + if (next == thd) return 1; thd->state |= THD_STATE_PREEMPTED; - int p = cap_switch(regs, thd, next, tcap_next, TCAP_TIME_NIL, ci, cos_info); - hw_asnd_call = 0; - return p; + /* don't disable timer if we're not switching to a diff tcap.. */ + /* TODO: hierarchical timeouts */ + return cap_switch(regs, thd, next, tcap_next, tcap == tcap_next ? tcap_cyc2time(cos_info->next_timer) : TCAP_TIME_NIL, ci, cos_info); } int @@ -1038,6 +1024,9 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str assert(!(thd->state & THD_STATE_PREEMPTED)); thd->state |= THD_STATE_RCVING; thd->timeout = timeout; + } else { + /* switching back to the thread.. don't disable timers..*/ + swtimeout = timeout; } return cap_switch(regs, thd, next, tc_next, swtimeout, ci, cos_info); diff --git a/src/kernel/include/dcb.h b/src/kernel/include/dcb.h index cd466180dd..eac71fa497 100644 --- a/src/kernel/include/dcb.h +++ b/src/kernel/include/dcb.h @@ -23,7 +23,7 @@ struct cap_dcb { cpuid_t cpuid; } __attribute__((packed)); -static int +static inline int dcb_activate(struct captbl *t, capid_t ctcap, capid_t dcbcap, vaddr_t kaddr, livenessid_t lid, capid_t ptcapin, vaddr_t uaddr) { struct cap_dcb *dc; @@ -50,7 +50,7 @@ dcb_activate(struct captbl *t, capid_t ctcap, capid_t dcbcap, vaddr_t kaddr, liv return 0; } -static int +static inline int dcb_deactivate(struct cap_captbl *ct, capid_t dcbcap, livenessid_t lid, capid_t ptcap, capid_t cosframe_addr, capid_t ptcapin, vaddr_t uaddrin) { struct cap_dcb *dc; diff --git a/src/kernel/include/hw.h b/src/kernel/include/hw.h index 6b28a17f2b..4c03f1cd87 100644 --- a/src/kernel/include/hw.h +++ b/src/kernel/include/hw.h @@ -24,7 +24,7 @@ struct cap_hw { u32_t hw_bitmap; } __attribute__((packed)); -static void +static inline void hw_asndcap_init(void) { memset(&hw_asnd_caps, 0, sizeof(struct cap_asnd) * HW_IRQ_TOTAL * NUM_CPU); @@ -36,7 +36,7 @@ hw_asndcap_init(void) * from another, and only with a subset of the bitmap. Any other HW * resources should not be passed on. 
*/ -static int +static inline int hw_activate(struct captbl *t, capid_t cap, capid_t capin, u32_t bitmap) { struct cap_hw *hwc; @@ -52,13 +52,13 @@ hw_activate(struct captbl *t, capid_t cap, capid_t capin, u32_t bitmap) return 0; } -static int +static inline int hw_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_HW, lid); } -static int +static inline int hw_attach_rcvcap(struct cap_hw *hwc, hwid_t hwid, struct cap_arcv *rcvc, capid_t rcv_cap) { if (hwid < HW_IRQ_EXTERNAL_MIN || hwid > HW_IRQ_EXTERNAL_MAX) return -EINVAL; @@ -68,7 +68,7 @@ hw_attach_rcvcap(struct cap_hw *hwc, hwid_t hwid, struct cap_arcv *rcvc, capid_t return asnd_construct(&hw_asnd_caps[get_cpuid()][hwid], rcvc, rcv_cap, 0, 0); } -static int +static inline int hw_detach_rcvcap(struct cap_hw *hwc, hwid_t hwid) { if (hwid < HW_IRQ_EXTERNAL_MIN || hwid > HW_IRQ_EXTERNAL_MAX) return -EINVAL; diff --git a/src/kernel/include/inv.h b/src/kernel/include/inv.h index 089c784b54..7ac9cb14b1 100644 --- a/src/kernel/include/inv.h +++ b/src/kernel/include/inv.h @@ -50,7 +50,7 @@ struct cap_arcv { u8_t depth; } __attribute__((packed)); -static int +static inline int sinv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, vaddr_t entry_addr, invtoken_t token) { struct cap_sinv *sinvc; @@ -72,13 +72,13 @@ sinv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, va return 0; } -static int +static inline int sinv_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_SINV, lid); } -static int +static inline int sret_activate(struct captbl *t, capid_t cap, capid_t capin) { struct cap_sret *sretc; @@ -91,13 +91,13 @@ sret_activate(struct captbl *t, capid_t cap, capid_t capin) return 0; } -static int +static inline int sret_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_SRET, lid); } -static int +static inline int asnd_construct(struct cap_asnd *asndc, struct cap_arcv *arcvc, capid_t rcv_cap, u32_t budget, u32_t period) { /* FIXME: Add synchronization with __xx_pre and __xx_post */ @@ -118,7 +118,7 @@ asnd_construct(struct cap_asnd *asndc, struct cap_arcv *arcvc, capid_t rcv_cap, return 0; } -static int +static inline int asnd_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t rcv_captbl, capid_t rcv_cap, u32_t budget, u32_t period) { @@ -142,7 +142,7 @@ asnd_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t rcv_captbl, return ret; } -static int +static inline int asnd_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { return cap_capdeactivate(t, capin, CAP_ASND, lid); @@ -153,7 +153,7 @@ int cap_ipi_process(struct pt_regs *regs); /* send to a receive end-point within an interrupt */ int cap_hw_asnd(struct cap_asnd *asnd, struct pt_regs *regs); -static void +static inline void __arcv_setup(struct cap_arcv *arcv, struct thread *thd, struct tcap *tcap, struct thread *notif) { assert(arcv && thd && tcap && !thd_bound2rcvcap(thd)); @@ -168,7 +168,7 @@ __arcv_setup(struct cap_arcv *arcv, struct thread *thd, struct tcap *tcap, struc tcap_promote(tcap, thd); } -static int +static inline int __arcv_teardown(struct cap_arcv *arcv, struct thread *thd) { struct thread *notif; @@ -189,13 +189,13 @@ __arcv_teardown(struct cap_arcv *arcv, struct thread *thd) return 0; } -static struct thread * +static inline struct thread * arcv_thd_notif(struct thread *arcvt) { return arcvt->rcvcap.rcvcap_thd_notif; } 
-static int +static inline int arcv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, capid_t thd_cap, capid_t tcap_cap, capid_t arcv_cap, int init) { @@ -245,7 +245,7 @@ arcv_activate(struct captbl *t, capid_t cap, capid_t capin, capid_t comp_cap, ca return 0; } -static int +static inline int arcv_deactivate(struct cap_captbl *t, capid_t capin, livenessid_t lid) { struct cap_arcv *arcvc; @@ -345,7 +345,7 @@ sret_ret(struct thread *thd, struct pt_regs *regs, struct cos_cpu_local_info *co __userregs_set(regs, __userregs_getinvret(regs), sp, ip); } -static void +static inline void inv_init(void) { //#define __OUTPUT_CAP_SIZE diff --git a/src/kernel/include/scb.h b/src/kernel/include/scb.h index 98c112cd8c..b90d66b3d2 100644 --- a/src/kernel/include/scb.h +++ b/src/kernel/include/scb.h @@ -21,7 +21,7 @@ struct cap_scb { vaddr_t kern_addr; } __attribute__((packed)); -static int +static inline int scb_activate(struct captbl *t, capid_t ctcap, capid_t scbcap, vaddr_t kaddr, livenessid_t lid) { struct cap_scb *sc; @@ -40,7 +40,7 @@ scb_activate(struct captbl *t, capid_t ctcap, capid_t scbcap, vaddr_t kaddr, liv return 0; } -static int +static inline int scb_deactivate(struct cap_captbl *ct, capid_t scbcap, capid_t ptcap, capid_t cosframe_addr, livenessid_t lid) { struct cap_scb *sc; diff --git a/src/kernel/include/thd.h b/src/kernel/include/thd.h index 55dde752dc..c9c01c734b 100644 --- a/src/kernel/include/thd.h +++ b/src/kernel/include/thd.h @@ -209,6 +209,12 @@ thd_invstk_peek_compinfo(struct thread *curr_thd, struct cos_cpu_local_info *cos return &(curr_thd->invstk[peek_index].comp_info); } +static inline int +thd_rcvcap_evt_pending(struct thread *t) +{ + return !list_isempty(&t->event_head); +} + static inline void thd_rcvcap_evt_enqueue(struct thread *head, struct thread *t) { @@ -222,7 +228,7 @@ thd_rcvcap_evt_enqueue(struct thread *head, struct thread *t) scb = ((c->scb_data) + get_cpuid()); r = &(scb->sched_events); - r->more = !list_isempty(&head->event_head); + r->more = thd_rcvcap_evt_pending(head); } static inline void @@ -248,33 +254,33 @@ thd_track_exec(struct thread *t) return !list_empty(&t->event_list); } -static int +static inline int thd_rcvcap_pending(struct thread *t) { if (t->rcvcap.pending || (t->dcbinfo && t->dcbinfo->pending)) return 1; - return !list_isempty(&t->event_head); + return thd_rcvcap_evt_pending(t); } -static sched_tok_t +static inline sched_tok_t thd_rcvcap_get_counter(struct thread *t) { return t->rcvcap.sched_count; } -static void +static inline void thd_rcvcap_set_counter(struct thread *t, sched_tok_t cntr) { t->rcvcap.sched_count = cntr; } -static void +static inline void thd_rcvcap_pending_set(struct thread *arcvt) { if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 1; else arcvt->rcvcap.pending = 1; } -static void +static inline void thd_rcvcap_pending_reset(struct thread *arcvt) { if (likely(arcvt->dcbinfo)) arcvt->dcbinfo->pending = 0; @@ -327,7 +333,7 @@ thd_scheduler_set(struct thread *thd, struct thread *sched) if (unlikely(thd->scheduler_thread != sched)) thd->scheduler_thread = sched; } -static int +static inline int thd_activate(struct captbl *t, capid_t cap, capid_t capin, struct thread *thd, capid_t compcap, thdclosure_index_t init_data, capid_t dcbcap, unsigned short dcboff) { struct cos_cpu_local_info *cli = cos_cpu_local_info(); @@ -432,7 +438,7 @@ thd_migrate(struct captbl *ct, capid_t thd_cap, cpuid_t core) return 0; } -static int +static inline int thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, 
unsigned long capin, livenessid_t lid, capid_t pgtbl_cap, capid_t cosframe_addr, capid_t dcbcap, const int root) { @@ -499,7 +505,7 @@ thd_deactivate(struct captbl *ct, struct cap_captbl *dest_ct, unsigned long capi return ret; } -static int +static inline int thd_tls_set(struct captbl *ct, capid_t thd_cap, vaddr_t tlsaddr, struct thread *current) { struct cap_thd *tc; @@ -517,7 +523,7 @@ thd_tls_set(struct captbl *ct, capid_t thd_cap, vaddr_t tlsaddr, struct thread * return 0; } -static void +static inline void thd_init(void) { assert(sizeof(struct cap_thd) <= __captbl_cap2bytes(CAP_THD)); @@ -607,7 +613,7 @@ thd_preemption_state_update(struct thread *curr, struct thread *next, struct pt_ memcpy(&curr->regs, regs, sizeof(struct pt_regs)); } -static int +static inline int thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info) { int delta = 0, inv_top = curr_invstk_top(cos_info); @@ -642,7 +648,7 @@ thd_sched_events_produce(struct thread *thd, struct cos_cpu_local_info *cos_info } r->tail += delta; - r->more = !list_isempty(&thd->event_head); + r->more = thd_rcvcap_evt_pending(thd); return delta; } From 59de0a3fd6c02a39767ba3529aa87f561413e308 Mon Sep 17 00:00:00 2001 From: phani Date: Mon, 7 Oct 2019 11:53:23 -0400 Subject: [PATCH 120/127] fixes in isolation and security tests --- .../implementation/sched/root_fprr/init.c | 11 +- .../implementation/sched/test_sched/init.c | 2 +- .../tests/micro_chan/unit_schedlib.c | 14 ++- .../tests/test_schedinv/test_schedinv.c | 2 +- .../implementation/tests/unit_slrcv/init.c | 114 +++++++++++------- src/components/include/cos_ulsched_rcv.h | 23 +++- src/components/include/sl.h | 100 +++++++-------- src/components/lib/sl/sl_child.c | 21 +++- src/components/lib/sl/sl_sched.c | 43 ++++--- src/kernel/capinv.c | 4 +- src/kernel/include/shared/cos_types.h | 1 + src/platform/i386/hpet.c | 2 +- src/platform/i386/runscripts/test_slite02.sh | 4 +- 13 files changed, 208 insertions(+), 133 deletions(-) diff --git a/src/components/implementation/sched/root_fprr/init.c b/src/components/implementation/sched/root_fprr/init.c index abfc035718..d78a6068cc 100644 --- a/src/components/implementation/sched/root_fprr/init.c +++ b/src/components/implementation/sched/root_fprr/init.c @@ -16,8 +16,8 @@ u32_t cycs_per_usec = 0; #define INITIALIZE_PERIOD_MS (4000) #define INITIALIZE_BUDGET_MS (2000) -#define FIXED_PRIO 1 -#define FIXED_PERIOD_MS (100000) +#define FIXED_PRIO 2 +#define FIXED_PERIOD_MS (50000) #define FIXED_BUDGET_MS (100000) static struct sl_thd *__initializer_thd[NUM_CPU] CACHE_ALIGNED; @@ -47,22 +47,23 @@ sched_child_init(struct sched_childinfo *schedci) { vaddr_t dcbaddr; struct sl_thd *initthd; + tcap_prio_t p = FIXED_PRIO; assert(schedci); + if (schedci->id != 1) p++; schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, schedci->flags & COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 
1 : 0, 0, 0, 0, &dcbaddr); assert(schedci->initthd); initthd = schedci->initthd; if (schedci->flags & COMP_FLAG_SCHED) { - if (cos_tcap_transfer(sl_thd_rcvcap(initthd), BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, FIXED_PRIO)) { + if (cos_tcap_transfer(sl_thd_rcvcap(initthd), BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, p)) { PRINTC("Failed to transfer INF budget\n"); assert(0); } sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); } - if (schedci->id == 1) sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO)); - else sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, FIXED_PRIO+1)); + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, p)); } thdid_t diff --git a/src/components/implementation/sched/test_sched/init.c b/src/components/implementation/sched/test_sched/init.c index 6874ea3b09..35563ec3ca 100644 --- a/src/components/implementation/sched/test_sched/init.c +++ b/src/components/implementation/sched/test_sched/init.c @@ -40,7 +40,7 @@ sched_child_init(struct sched_childinfo *schedci) extern void __sched_stdio_thd_init(thdid_t, struct crt_chan *, struct crt_chan *); #define MAX_PIPE_SZ 8 -#define MAX_USE_PIPE_SZ 3 +#define MAX_USE_PIPE_SZ 4 CRT_CHAN_STATIC_ALLOC(c0, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c1, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); diff --git a/src/components/implementation/tests/micro_chan/unit_schedlib.c b/src/components/implementation/tests/micro_chan/unit_schedlib.c index 589094d6ea..736e6bbae2 100644 --- a/src/components/implementation/tests/micro_chan/unit_schedlib.c +++ b/src/components/implementation/tests/micro_chan/unit_schedlib.c @@ -28,6 +28,8 @@ CRT_CHAN_STATIC_ALLOC(c0, int, 4); CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); struct crt_lock lock; +unsigned int one_only = 0; + typedef enum { CHILLING = 0, RECVING, SENDING } actions_t; unsigned long status[NCHANTHDS]; unsigned long cnts[NCHANTHDS] = {0, }; @@ -146,6 +148,7 @@ test_thd_fn(void *data) rounds ++; crt_chan_recv_test(c0, &recv); rdtscll(end_time); + assert(ps_faa(&one_only, -1) == 1); diff = end_time - start_time; if (diff > max) max = diff; @@ -157,7 +160,7 @@ test_thd_fn(void *data) int i; for (i = 0; i < CHAN_ITER; i++) { - printc("%llu, ", iters[i]); + printc("%llu\n", iters[i]); } printc("\nAvg: %llu, Wc:%llu\n", total / CHAN_ITER, max); @@ -170,8 +173,9 @@ test_thd_fn(void *data) } } else { + send = 0x1234; while (1) { - send = 0x1234; + assert(ps_faa(&one_only, 1) == 0); rdtscll(start_time); crt_chan_send_test(c0, &send); } @@ -352,7 +356,6 @@ test_yields(void) start_time = end_time = 0; - crt_chan_init_test(c0); for (i = 0; i < N_TESTTHDS; i++) { threads[i] = sl_thd_alloc(test_thd_fn, (void *)i); assert(threads[i]); @@ -362,6 +365,9 @@ test_yields(void) PRINTC("Thread %u:%lu created\n", sl_thd_thdid(threads[i]), sl_thd_thdcap(threads[i])); //sl_thd_yield_thd(threads[i]); } + assert(N_TESTTHDS == 2); + //crt_chan_p2p_init_test(c0, threads[SND], threads[RCV]); + crt_chan_init_test(c0); } //void @@ -476,7 +482,7 @@ cos_init(void) cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_llinit(); cos_dcb_info_init_curr(); - sl_init(SL_MIN_PERIOD_US*10); + sl_init(SL_MIN_PERIOD_US*50); //test_yield_perf(); test_yields(); diff --git a/src/components/implementation/tests/test_schedinv/test_schedinv.c 
b/src/components/implementation/tests/test_schedinv/test_schedinv.c index 6cda87a69e..36f938e0a2 100644 --- a/src/components/implementation/tests/test_schedinv/test_schedinv.c +++ b/src/components/implementation/tests/test_schedinv/test_schedinv.c @@ -19,7 +19,7 @@ static u32_t cycs_per_usec = 0; -#define MAX_USE_PIPE_SZ 3 +#define MAX_USE_PIPE_SZ 4 #define SND_DATA 0x4321 #define HPET_PERIOD_TEST_US 20000 diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c index 63dc56e2e7..efddf24649 100644 --- a/src/components/implementation/tests/unit_slrcv/init.c +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -33,7 +33,7 @@ ping_fn(void *d) } unsigned int iter = 0; -cycles_t st = 0, en = 0, tot = 0, wc = 0; +volatile cycles_t st = 0, en = 0, tot = 0, wc = 0; CRT_CHAN_STATIC_ALLOC(c0, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c1, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); @@ -42,15 +42,25 @@ CRT_CHAN_STATIC_ALLOC(c4, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c5, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); -#define PIPELINE_LEN 3 -#define ITERS 100 +#define PIPELINE_LEN 4 +#define PRIO_START (TCAP_PRIO_MAX + 10 + PIPELINE_LEN + 1) +#define PRIO_INT (PRIO_START + 1) +#define ITERS 1000 +static cycles_t vals[ITERS] = { 0 }; +static int iters = 0; +static int pipe_line = 0; +static int pipe_send = 0, pipe_rcv = 0; static inline void chrcv(int i) { int r; - //printc("[r%d]", i); + if (i == 0) { + assert(ps_cas(&pipe_rcv, 0, PIPELINE_LEN)); + } + + //printc("[r%d,%d]", i, pipe_line); switch(i) { case 0: crt_chan_recv_test(c0, &r); break; case 1: crt_chan_recv_test(c1, &r); break; @@ -60,7 +70,9 @@ chrcv(int i) case 5: crt_chan_recv_test(c5, &r); break; default: assert(0); } - //printc("[d%d]", i); + assert(ps_faa(&pipe_line, -1) == 1); + //printc("[d%d,%d]", i, pipe_line); + assert(ps_faa(&pipe_rcv, -1) == (PIPELINE_LEN - i)); } static inline void @@ -68,7 +80,12 @@ chsnd(int i) { int s = 0xDEAD0000 | i; - //printc("[s%d]", i); + if (i == 0) { + assert(ps_cas(&pipe_send, 0, PIPELINE_LEN)); + } + assert(ps_faa(&pipe_send, -1) == (PIPELINE_LEN - i)); + //printc("[s%d,%d]", i, pipe_line); + assert(ps_faa(&pipe_line, 1) == 0); switch(i) { case 0: crt_chan_send_test(c0, &s); break; case 1: crt_chan_send_test(c1, &s); break; @@ -78,7 +95,7 @@ chsnd(int i) case 5: crt_chan_send_test(c5, &s); break; default: assert(0); } - //printc("[o%d]", i); + //printc("[o%d,%d]", i, pipe_line); } static inline void @@ -105,15 +122,17 @@ work_fn(void *x) if (likely(chid + 1 < PIPELINE_LEN)) chsnd(chid + 1); else { rdtscll(en); + printc("e"); assert(en > st); cycles_t diff = en - st; if (diff > wc) wc = diff; + printc("%llu\n", diff); tot += diff; iter ++; if (unlikely(iter == ITERS)) { - PRINTC("%llu %llu\n", tot / iter, wc); - //iter = 0; - //wc = tot = 0; + PRINTC("%d: %llu %llu\n", iter, tot / iter, wc); + iter = 0; + wc = tot = 0; } } } @@ -130,8 +149,11 @@ pong_fn(arcvcap_t r, void *d) assert(a == 0); while (1) { - if (iter == ITERS) capmgr_hw_detach(HW_HPET_PERIODIC); + //if (iter == ITERS) capmgr_hw_detach(HW_HPET_PERIODIC); + //printc("I"); int p = sl_thd_rcv(RCV_ULONLY); + //work_usecs(WORK_US); + printc("s"); rdtscll(st); chsnd(0); } @@ -149,61 +171,65 @@ cos_init(void *d) static volatile asndcap_t s = 0; unsigned int cycs_per_us = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - if (NUM_CPU == 2) { - 
assert(0); // need to rework.. - if (cos_cpuid() == 0) { - cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_defcompinfo_llinit(); - cos_dcb_info_init_curr(); - sl_init(SL_MIN_PERIOD_US); - - struct sl_thd *t = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); - assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); - r = sl_thd_rcvcap(t); - assert(r); - } else { - while (!ps_load(&init_done[0])) ; - - cos_defcompinfo_sched_init(); - cos_dcb_info_init_curr(); - sl_init(SL_MIN_PERIOD_US); - - struct sl_thd *t = sl_thd_alloc(ping_fn, (void *)&s); - assert(t); - sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); - - while (!r) ; - s = cos_asnd_alloc(ci, r, ci->captbl_cap); - assert(s); - } - } else { +// if (NUM_CPU == 2) { +// assert(0); // need to rework.. +// if (cos_cpuid() == 0) { +// cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); +// cos_defcompinfo_llinit(); +// cos_dcb_info_init_curr(); +// sl_init(SL_MIN_PERIOD_US); +// +// struct sl_thd *t = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); +// assert(t); +// sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); +// r = sl_thd_rcvcap(t); +// assert(r); +// } else { +// while (!ps_load(&init_done[0])) ; +// +// cos_defcompinfo_sched_init(); +// cos_dcb_info_init_curr(); +// sl_init(SL_MIN_PERIOD_US); +// +// struct sl_thd *t = sl_thd_alloc(ping_fn, (void *)&s); +// assert(t); +// sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1)); +// +// while (!r) ; +// s = cos_asnd_alloc(ci, r, ci->captbl_cap); +// assert(s); +// } +// } else { assert(NUM_CPU == 1); cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); cos_defcompinfo_init(); sl_init(SL_MIN_PERIOD_US*100); - int i; + //int i; struct sl_thd *rt = sl_thd_aep_alloc(pong_fn, NULL, 0, 0, 0, 0); assert(rt); + //sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, PRIO_INT)); for (i = 0; i < PIPELINE_LEN; i++) { wt[i] = sl_thd_alloc(work_fn, (void *)i); assert(wt[i]); - sl_thd_param_set(wt[i], sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1+PIPELINE_LEN-i)); + //sl_thd_param_set(wt[i], sched_param_pack(SCHEDP_PRIO, PRIO_START-i)); if (i == 0) chinit(i, 0, 0); else chinit(i, wt[i-1], wt[i]); } - sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX+1+PIPELINE_LEN+1)); - } +// } ps_faa(&init_done[cos_cpuid()], 1); /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! 
*/ for (i = 0; i < NUM_CPU; i++) { while (!ps_load(&init_done[i])) ; } + PRINTC("Int component init done!\n"); //hypercall_comp_init_done(); schedinit_child(); + for (i = 0; i < PIPELINE_LEN; i++) sl_thd_param_set(wt[i], sched_param_pack(SCHEDP_PRIO, PRIO_START-i)); + sl_thd_param_set(rt, sched_param_pack(SCHEDP_PRIO, PRIO_INT)); + sl_sched_loop(); PRINTC("Should never get here!\n"); diff --git a/src/components/include/cos_ulsched_rcv.h b/src/components/include/cos_ulsched_rcv.h index 0fa235a7df..60ff25d795 100644 --- a/src/components/include/cos_ulsched_rcv.h +++ b/src/components/include/cos_ulsched_rcv.h @@ -6,7 +6,7 @@ static inline int __cos_sched_events_present(struct cos_sched_ring *r) { - return !(r->tail == r->head); + return (ps_load(&r->tail) != ps_load(&r->head)); } static inline int @@ -15,7 +15,16 @@ cos_sched_ispending(void) struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); struct cos_sched_ring *r = &scb_cpu->sched_events; - return r->more; + return ps_load(&r->more); +} + +static inline int +cos_sched_events_isempty(void) +{ + struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); + struct cos_sched_ring *r = &scb_cpu->sched_events; + + return (ps_load(&r->tail) == ps_load(&r->head)) && !ps_load(&r->more); } static inline int @@ -23,7 +32,8 @@ __cos_sched_event_consume(struct cos_sched_ring *r, struct cos_sched_event *e) { int f = 0; - if (!r || !e || !__cos_sched_events_present(r)) return 0; + if (unlikely(!r || !__cos_sched_events_present(r))) return 0; + assert(e); f = ps_upfaa((unsigned long *)&r->head, 1); *e = r->event_buf[f]; // memcpy((void *)e, (void *)&(r->event_buf[f]), sizeof(struct cos_sched_event)); @@ -39,10 +49,12 @@ cos_ul_sched_rcv(arcvcap_t rcv, rcv_flags_t rfl, tcap_time_t timeout, struct cos struct cos_scb_info *scb_cpu = cos_scb_info_get_core(); struct cos_sched_ring *r = &scb_cpu->sched_events; + evt->tid = 0; assert(scb_cpu); /* a non-scheduler thread, should call with rcv == 0 to consume user-level events alone */ - if (unlikely(__cos_sched_event_consume(r, evt) == 0 - && rcv && !(rfl & RCV_ULONLY))) { + if (__cos_sched_event_consume(r, evt) == 0 + && rcv && !(rfl & RCV_ULONLY)) { + ret = cos_sched_rcv(rcv, rfl, timeout, &(evt->tid), (int *)&(evt->evt.blocked), (cycles_t *)&(evt->evt.elapsed_cycs), (tcap_time_t *)&(evt->evt.next_timeout)); if (unlikely(ret < 0)) return ret; @@ -57,6 +69,7 @@ cos_ul_rcv(arcvcap_t rcv, rcv_flags_t rfl, tcap_time_t sched_timeout) struct cos_sched_event ev = { .tid = 0 }; int ret = 0; + if (likely(sched_timeout)) rfl |= RCV_SCHEDTIMEOUT; ret = cos_sched_rcv(rcv, rfl, sched_timeout, &(ev.tid), (int *)&(ev.evt.blocked), (cycles_t *)&(ev.evt.elapsed_cycs), (tcap_time_t *)&(ev.evt.next_timeout)); assert(ev.tid == 0); diff --git a/src/components/include/sl.h b/src/components/include/sl.h index ac10bb28c2..777125322e 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -43,6 +43,7 @@ #define SL_CS #undef SL_REPLENISH +#undef SL_PARENTCHILD /* Critical section (cs) API to protect scheduler data-structures */ struct sl_cs { @@ -450,21 +451,21 @@ sl_thd_dispatch_kern(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr, assert(curr != next); if (unlikely(!cd || !nd)) return cos_switch(sl_thd_thdcap(next), sl_thd_tcap(next), next->prio, timeout, g->sched_rcv, tok); - __asm__ __volatile__ ( \ - "pushl %%ebp\n\t" \ - "movl %%esp, %%ebp\n\t" \ - "movl $1f, (%%esi)\n\t" \ - "movl %%esp, 4(%%esi)\n\t" \ - "movl %%ecx, %%esi\n\t" \ - "movl $2f, %%ecx\n\t" \ - "sysenter\n\t" \ - "jmp 2f\n\t" \ - 
".align 4\n\t" \ - "1:\n\t" \ - "movl $0, %%eax\n\t" \ - ".align 4\n\t" \ - "2:\n\t" \ - "popl %%ebp\n\t" \ + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $1f, (%%esi)\n\t" \ + "movl %%esp, 4(%%esi)\n\t" \ + "movl %%ecx, %%esi\n\t" \ + "movl $2f, %%ecx\n\t" \ + "sysenter\n\t" \ + "jmp 2f\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $0, %%eax\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "popl %%ebp\n\t" \ : "=a" (ret) : "a" (a), "b" (b), "S" (cd), "D" (D), "d" (d), "c" (S) : "memory", "cc"); @@ -507,34 +508,34 @@ sl_thd_dispatch_usr(struct sl_thd *next, sched_tok_t tok, struct sl_thd *curr) * slowpath thread switch! */ - __asm__ __volatile__ ( \ - "pushl %%ebp\n\t" \ - "movl %%esp, %%ebp\n\t" \ - "movl $2f, (%%eax)\n\t" \ - "movl %%esp, 4(%%eax)\n\t" \ - "cmp $0, 4(%%ebx)\n\t" \ - "je 1f\n\t" \ - "movl %%edx, (%%ecx)\n\t" \ - "movl 4(%%ebx), %%esp\n\t" \ - "jmp *(%%ebx)\n\t" \ - ".align 4\n\t" \ - "1:\n\t" \ - "movl $3f, %%ecx\n\t" \ - "movl %%edx, %%eax\n\t" \ - "inc %%eax\n\t" \ - "shl $16, %%eax\n\t" \ - "movl $0, %%ebx\n\t" \ - "movl %%esi, %%edx\n\t" \ - "movl $0, %%esi\n\t" \ - "movl $0, %%edi\n\t" \ - "sysenter\n\t" \ - "jmp 3f\n\t" \ - ".align 4\n\t" \ - "2:\n\t" \ - "movl $0, 4(%%ebx)\n\t" \ - ".align 4\n\t" \ - "3:\n\t" \ - "popl %%ebp\n\t" \ + __asm__ __volatile__ ( \ + "pushl %%ebp\n\t" \ + "movl %%esp, %%ebp\n\t" \ + "movl $2f, (%%eax)\n\t" \ + "movl %%esp, 4(%%eax)\n\t" \ + "cmp $0, 4(%%ebx)\n\t" \ + "je 1f\n\t" \ + "movl %%edx, (%%ecx)\n\t" \ + "movl 4(%%ebx), %%esp\n\t" \ + "jmp *(%%ebx)\n\t" \ + ".align 4\n\t" \ + "1:\n\t" \ + "movl $3f, %%ecx\n\t" \ + "movl %%edx, %%eax\n\t" \ + "inc %%eax\n\t" \ + "shl $16, %%eax\n\t" \ + "movl $0, %%ebx\n\t" \ + "movl %%esi, %%edx\n\t" \ + "movl $0, %%esi\n\t" \ + "movl $0, %%edi\n\t" \ + "sysenter\n\t" \ + "jmp 3f\n\t" \ + ".align 4\n\t" \ + "2:\n\t" \ + "movl $0, 4(%%ebx)\n\t" \ + ".align 4\n\t" \ + "3:\n\t" \ + "popl %%ebp\n\t" \ : : "a" (cd), "b" (nd), "S" (g->timeout_next), "D" (tok), @@ -566,7 +567,9 @@ sl_thd_activate_c(struct sl_thd *t, sched_tok_t tok, tcap_time_t timeout, tcap_p /* TODO: there is something in the kernel that seem to disable timers..!! */ /* WORKAROUND: idle thread is a big cpu hogger.. so make sure there is timeout set around switching to and away! */ - if (unlikely(curr == g->idle_thd || t == g->idle_thd)) return sl_thd_dispatch_kern(t, tok, curr, g->timeout_next, g->sched_tcap, prio); + if (unlikely(curr == g->idle_thd || t == g->idle_thd)) { + return sl_thd_dispatch_kern(t, tok, curr, g->timeout_next, g->sched_tcap, prio); + } if (unlikely(timeout || prio)) { return sl_thd_dispatch_kern(t, tok, curr, timeout, g->sched_tcap, prio); @@ -665,6 +668,7 @@ sl_cs_exit_schedule_nospin_arg(struct sl_thd *to) else t = sl_mod_thd_get(pt); } + if (unlikely(!t)) t= globals->sched_thd; #ifdef SL_REPLENISH sl_thd_replenish_no_cs(t, now); @@ -766,6 +770,7 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) else t = sl_mod_thd_get(pt); } + if (unlikely(!t)) t= globals->sched_thd; #ifdef SL_REPLENISH sl_thd_replenish_no_cs(t, now); @@ -781,6 +786,7 @@ sl_cs_exit_schedule_nospin_arg_timeout(struct sl_thd *to, cycles_t abs_timeout) sl_cs_exit(); #endif if (likely(c == t && t == globals->sched_thd && timeout)) { + /* program the new timer.. 
*/ return cos_defswitch(globals->sched_thdcap, globals->sched_thd->prio, timeout, tok); } if (unlikely(t == c)) return 0; @@ -959,10 +965,8 @@ sl_thd_event_enqueue(struct sl_thd *t, struct cos_thd_event *e) { struct sl_global_core *g = sl__globals_core(); - if (e->epoch <= t->event_info.epoch) { - printc("<%d>", sl_thd_thdid(t)); - return; - } + assert(e->epoch); + if (e->epoch <= t->event_info.epoch) return; if (ps_list_singleton(t, SL_THD_EVENT_LIST)) ps_list_head_append(&g->event_head, t, SL_THD_EVENT_LIST); diff --git a/src/components/lib/sl/sl_child.c b/src/components/lib/sl/sl_child.c index 0edf6d1023..badc3bba88 100644 --- a/src/components/lib/sl/sl_child.c +++ b/src/components/lib/sl/sl_child.c @@ -47,6 +47,7 @@ sl_parent_notif_alloc(struct sl_thd *childthd) int sl_parent_notif_enqueue(struct sl_thd *thd, struct sl_child_notification *notif) { +#ifdef SL_PARENTCHILD assert(thd && notif); assert(thd->properties & SL_THD_PROPERTY_SEND); @@ -55,6 +56,9 @@ sl_parent_notif_enqueue(struct sl_thd *thd, struct sl_child_notification *notif) if (ck_ring_enqueue_spsc_child(thd->ch_ring, thd->ch_ringbuf, notif) == false) return -1; if (cos_asnd(sl_thd_asndcap(thd), 0)) return -1; +#else + assert(0); +#endif return 0; } @@ -85,6 +89,7 @@ sl_child_notif_map(cbuf_t id) int sl_child_notif_dequeue(struct sl_child_notification *notif) { +#ifdef SL_PARENTCHILD struct ck_ring *cring = child_ring[cos_cpuid()]; struct sl_child_notification *crbuf = child_ringbuf[cos_cpuid()]; @@ -92,38 +97,52 @@ sl_child_notif_dequeue(struct sl_child_notification *notif) if (!cring || !crbuf) return 0; if (ck_ring_dequeue_spsc_child(cring, crbuf, notif) == true) return 1; - +#endif return 0; } int sl_child_notif_empty(void) { +#ifdef SL_PARENTCHILD struct ck_ring *cring = child_ring[cos_cpuid()]; if (!cring) return 1; return (!ck_ring_size(cring)); +#else + return 1; +#endif } int sl_parent_notif_block_no_cs(struct sl_thd *child, struct sl_thd *thd) { +#ifdef SL_PARENTCHILD struct sl_child_notification notif; notif.type = SL_CHILD_THD_BLOCK; notif.tid = sl_thd_thdid(thd); return sl_parent_notif_enqueue(child, ¬if); +#else + assert(0); + return 0; +#endif } int sl_parent_notif_wakeup_no_cs(struct sl_thd *child, struct sl_thd *thd) { +#ifdef SL_PARENTCHILD struct sl_child_notification notif; notif.type = SL_CHILD_THD_WAKEUP; notif.tid = sl_thd_thdid(thd); return sl_parent_notif_enqueue(child, ¬if); +#else + assert(0); + return 0; +#endif } diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index 44ed18bc98..e91444b852 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -558,7 +558,23 @@ sl_timeout_period(microsec_t period) /* engage space heater mode */ void sl_idle(void *d) -{ while (1) ; } +{ + struct sl_global_core *gc = sl__globals_core(); + + while (1) { + cycles_t now = sl_now(); + + do { + if (cos_sched_ispending() || +#if NUM_CPU > 1 + ck_ring_size(sl__ring_curr()) != 0 || +#endif + !sl_child_notif_empty()) break; + now = sl_now(); + } while (now < gc->timer_next); + sl_thd_activate_c(gc->sched_thd, cos_sched_sync(), 0, 0, gc->idle_thd, gc); + } +} /* call from the user? 
*/ static void @@ -727,23 +743,12 @@ sl_sched_loop_intern(int non_block) struct cos_sched_event e = { .tid = 0 }; - struct sl_thd *curr = sl_thd_curr(); - struct cos_dcb_info *cd = sl_thd_dcbinfo(curr); - assert(cd->sp == 0); /* * a child scheduler may receive both scheduling notifications (block/unblock * states of it's child threads) and normal notifications (mainly activations from * it's parent scheduler). */ - //pending = cos_ul_sched_rcv(g->sched_rcv, rfl, g->timeout_next, &e); -// if (cos_spd_id() != 4) printc("L"); - //else printc("l"); pending = __sl_sched_rcv(rfl, &e); - assert(cd->sp == 0); -// if (cos_spd_id() != 4) printc("M"); - - //else printc("m"); - if (pending < 0 || !e.tid) goto pending_events; t = sl_thd_lkup(e.tid); @@ -762,8 +767,11 @@ sl_sched_loop_intern(int non_block) pending_events: if (ps_list_head_empty(&g->event_head) && +#if NUM_CPU > 1 ck_ring_size(sl__ring_curr()) == 0 && - sl_child_notif_empty()) continue; +#endif + sl_child_notif_empty() && + !cos_sched_events_isempty()) continue; /* * receiving scheduler notifications is not in critical section mainly for @@ -807,8 +815,10 @@ sl_sched_loop_intern(int non_block) else sl_thd_wakeup_no_cs(t); } +#if NUM_CPU > 1 /* process cross-core requests */ sl_xcore_process_no_cs(); +#endif sl_cs_exit(); } while (pending > 0); @@ -831,13 +841,6 @@ sl_sched_loop_nonblock(void) sl_sched_loop_intern(1); } -int -sl_thd_kern_dispatch(thdcap_t t) -{ - //return cos_switch(t, sl__globals_core()->sched_tcap, 0, sl__globals_core()->timeout_next, sl__globals_core()->sched_rcv, cos_sched_sync()); - return cos_thd_switch(t); -} - void sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now) { diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index dffc472ae9..1961c7e3b5 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -969,8 +969,10 @@ cap_arcv_op(struct cap_arcv *arcv, struct thread *thd, struct pt_regs *regs, str struct next_thdinfo *nti = &cos_info->next_ti; rcv_flags_t rflags = __userregs_get1(regs); tcap_time_t swtimeout = TCAP_TIME_NIL; - tcap_time_t timeout = __userregs_get2(regs); + tcap_time_t timeout = TCAP_TIME_NIL, x = __userregs_get2(regs); + if (likely(rflags & RCV_SCHEDTIMEOUT)) swtimeout = x; + else timeout = x; if (unlikely(arcv->thd != thd || arcv->cpuid != get_cpuid())) return -EINVAL; /* deliver pending notifications? 
*/ diff --git a/src/kernel/include/shared/cos_types.h b/src/kernel/include/shared/cos_types.h index 2022db0368..cee8b006ef 100644 --- a/src/kernel/include/shared/cos_types.h +++ b/src/kernel/include/shared/cos_types.h @@ -74,6 +74,7 @@ typedef enum { RCV_NON_BLOCKING = 1, RCV_ULONLY = (1 << 1), RCV_ULSCHED_RCV = (1 << 2), + RCV_SCHEDTIMEOUT = (1 << 3), } rcv_flags_t; #define BOOT_LIVENESS_ID_BASE 2 diff --git a/src/platform/i386/hpet.c b/src/platform/i386/hpet.c index 350628a09e..840754ef2c 100644 --- a/src/platform/i386/hpet.c +++ b/src/platform/i386/hpet.c @@ -202,7 +202,7 @@ static int count = 0; if (unlikely(hpet_periodicity_curr[HPET_PERIODIC] && !hpet_first_hpet_period)) { count++; - if (count < 15) goto done; + if (count < 25) goto done; rdtscll(hpet_first_hpet_period); } diff --git a/src/platform/i386/runscripts/test_slite02.sh b/src/platform/i386/runscripts/test_slite02.sh index 78f7b5127c..d01a307cbb 100644 --- a/src/platform/i386/runscripts/test_slite02.sh +++ b/src/platform/i386/runscripts/test_slite02.sh @@ -12,10 +12,10 @@ cp test_boot.o dummy2.o #./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub #int, w0 in root and w1 in comp -./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o" ./gen_client_stub +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o" ./gen_client_stub # int, w1 - w3 -#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" ./gen_client_stub +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" ./gen_client_stub #cp test_boot.o dummy.o #./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub From 57f3023baafa6b210dfbcbe174f7d0d28626309c Mon Sep 17 00:00:00 2001 From: phani Date: Thu, 17 Oct 2019 11:46:12 -0400 Subject: [PATCH 121/127] removed debug assert, cleaned up test programs --- .../implementation/sched/root_fprr/init.c | 6 ++++-- .../implementation/sched/test_sched/init.c | 16 ++++++++++---- .../tests/micro_chan/unit_schedlib.c | 4 ++-- .../tests/test_schedinv/test_schedinv.c | 21 ++++++++++++++----- .../implementation/tests/unit_slrcv/init.c | 13 ++++++------ src/components/include/sl.h | 2 +- src/components/lib/sl/sl_sched.c | 3 ++- src/kernel/capinv.c | 1 - src/platform/i386/runscripts/test_slite02.sh | 4 ++-- src/platform/i386/runscripts/unit_slite01.sh | 3 ++- 10 files changed, 48 insertions(+), 25 deletions(-) diff --git a/src/components/implementation/sched/root_fprr/init.c b/src/components/implementation/sched/root_fprr/init.c index d78a6068cc..54f819c7ff 100644 --- a/src/components/implementation/sched/root_fprr/init.c +++ b/src/components/implementation/sched/root_fprr/init.c @@ -50,7 +50,8 @@ sched_child_init(struct sched_childinfo *schedci) tcap_prio_t p = FIXED_PRIO; assert(schedci); - if (schedci->id != 1) p++; + if (schedci->id != 1) p = FIXED_PRIO; + else p = FIXED_PRIO + 1; schedci->initthd = sl_thd_initaep_alloc(sched_child_defci_get(schedci), NULL, schedci->flags & 
COMP_FLAG_SCHED, schedci->flags & COMP_FLAG_SCHED ? 1 : 0, 0, 0, 0, &dcbaddr); assert(schedci->initthd); initthd = schedci->initthd; @@ -59,9 +60,10 @@ sched_child_init(struct sched_childinfo *schedci) if (cos_tcap_transfer(sl_thd_rcvcap(initthd), BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, p)) { PRINTC("Failed to transfer INF budget\n"); assert(0); + } else { + sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); } sl_thd_param_set(initthd, sched_param_pack(SCHEDP_WINDOW, FIXED_PERIOD_MS)); - sl_thd_param_set(initthd, sched_param_pack(SCHEDP_BUDGET, FIXED_BUDGET_MS)); } sl_thd_param_set(initthd, sched_param_pack(SCHEDP_PRIO, p)); } diff --git a/src/components/implementation/sched/test_sched/init.c b/src/components/implementation/sched/test_sched/init.c index 35563ec3ca..63ea842050 100644 --- a/src/components/implementation/sched/test_sched/init.c +++ b/src/components/implementation/sched/test_sched/init.c @@ -13,6 +13,7 @@ #include #include +#define MAX_USE_PIPE_SZ 1 #define INITIALIZE_PRIO 1 #define INITIALIZE_PERIOD_MS (4000) #define INITIALIZE_BUDGET_MS (2000) @@ -40,7 +41,6 @@ sched_child_init(struct sched_childinfo *schedci) extern void __sched_stdio_thd_init(thdid_t, struct crt_chan *, struct crt_chan *); #define MAX_PIPE_SZ 8 -#define MAX_USE_PIPE_SZ 4 CRT_CHAN_STATIC_ALLOC(c0, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c1, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); CRT_CHAN_STATIC_ALLOC(c2, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); @@ -68,7 +68,8 @@ CRT_CHAN_STATIC_ALLOC(c7, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); #define SND_DATA 0x1234 #define SHMCHANNEL_KEY 0x2020 -#define MAX_ITERS 100 +#define MAX_ITERS 100000 +cycles_t vals[MAX_ITERS] = { 0 }; int iters = 0; cycles_t tot = 0, wc = 0; static int pc, tc; @@ -134,17 +135,24 @@ work_thd_fn(void *data) chan_in(); if (unlikely(is_last)) { cycles_t end, diff; + if (iters >= MAX_ITERS) continue; rdtscll(end); assert(int_start); diff = end - *int_start; if (wc < diff) wc = diff; tot += diff; + vals[iters] = diff; + //printc("%llu\n", diff); iters++; + if (iters % 1000 == 0) printc("."); if (iters == MAX_ITERS) { + int i; + + for (i = 0; i < MAX_ITERS; i++) printc("%llu\n", vals[i]); PRINTC("%llu, %llu\n", tot / iters, wc); - tot = wc = 0; - iters = 0; + //tot = wc = 0; + //iters = 0; } continue; } diff --git a/src/components/implementation/tests/micro_chan/unit_schedlib.c b/src/components/implementation/tests/micro_chan/unit_schedlib.c index 736e6bbae2..8c770acd53 100644 --- a/src/components/implementation/tests/micro_chan/unit_schedlib.c +++ b/src/components/implementation/tests/micro_chan/unit_schedlib.c @@ -366,8 +366,8 @@ test_yields(void) //sl_thd_yield_thd(threads[i]); } assert(N_TESTTHDS == 2); - //crt_chan_p2p_init_test(c0, threads[SND], threads[RCV]); - crt_chan_init_test(c0); + crt_chan_p2p_init_test(c0, threads[SND], threads[RCV]); + //crt_chan_init_test(c0); } //void diff --git a/src/components/implementation/tests/test_schedinv/test_schedinv.c b/src/components/implementation/tests/test_schedinv/test_schedinv.c index 36f938e0a2..5782539f3f 100644 --- a/src/components/implementation/tests/test_schedinv/test_schedinv.c +++ b/src/components/implementation/tests/test_schedinv/test_schedinv.c @@ -19,7 +19,7 @@ static u32_t cycs_per_usec = 0; -#define MAX_USE_PIPE_SZ 4 +#define MAX_USE_PIPE_SZ 1 #define SND_DATA 0x4321 #define HPET_PERIOD_TEST_US 20000 @@ -27,6 +27,9 @@ static u32_t cycs_per_usec = 0; #define SHMCHANNEL_KEY 0x2020 static cycles_t *sttsc = NULL; volatile unsigned long *rdy = 
NULL; +int iters = 0; +#define ITERS 100000 +cycles_t vals[ITERS] = { 0 }; static void __test_int_fn(arcvcap_t rcv, void *data) @@ -40,16 +43,17 @@ __test_int_fn(arcvcap_t rcv, void *data) /* TODO: register to HPET */ while (1) { cos_rcv(rcv, 0); + iters++; rdtscll(*sttsc); chan_out(SND_DATA); + + if (iters == ITERS) capmgr_hw_detach(HW_HPET_PERIODIC); } sched_thd_exit(); } -#define ITERS 100 cycles_t tot = 0, wc = 0; -int iters = 0; static void __test_wrk_fn(void *data) @@ -62,16 +66,23 @@ __test_wrk_fn(void *data) if (unlikely(e)) { cycles_t en, diff; + if (unlikely(iters >= ITERS)) continue; rdtscll(en); assert(sttsc); diff = en - *sttsc; if (diff > wc) wc = diff; tot += diff; + vals[iters] = diff; + //printc("%llu\n", diff); iters++; + if (iters % 1000 == 0) printc(","); if (iters == ITERS) { - PRINTC("%llu, %llu\n", tot / ITERS, wc); + int i; + + for (i = 0; i < ITERS; i++) printc("%llu\n", vals[i]); + PRINTC("%llu, %llu\n", tot / ITERS, wc); tot = wc = 0; - iters = 0; + //iters = 0; } continue; } diff --git a/src/components/implementation/tests/unit_slrcv/init.c b/src/components/implementation/tests/unit_slrcv/init.c index efddf24649..aa5a85741c 100644 --- a/src/components/implementation/tests/unit_slrcv/init.c +++ b/src/components/implementation/tests/unit_slrcv/init.c @@ -45,9 +45,8 @@ CRT_CHAN_TYPE_PROTOTYPES(test, int, 4); #define PIPELINE_LEN 4 #define PRIO_START (TCAP_PRIO_MAX + 10 + PIPELINE_LEN + 1) #define PRIO_INT (PRIO_START + 1) -#define ITERS 1000 +#define ITERS 100000 static cycles_t vals[ITERS] = { 0 }; -static int iters = 0; static int pipe_line = 0; static int pipe_send = 0, pipe_rcv = 0; @@ -122,14 +121,17 @@ work_fn(void *x) if (likely(chid + 1 < PIPELINE_LEN)) chsnd(chid + 1); else { rdtscll(en); - printc("e"); + if (iter >= ITERS) continue; assert(en > st); cycles_t diff = en - st; if (diff > wc) wc = diff; - printc("%llu\n", diff); + //printc("%llu\n", diff); + vals[iter] = diff; tot += diff; iter ++; if (unlikely(iter == ITERS)) { + int i; + for (i = 0; i < ITERS; i++) printc("%llu\n", vals[i]); PRINTC("%d: %llu %llu\n", iter, tot / iter, wc); iter = 0; wc = tot = 0; @@ -149,13 +151,12 @@ pong_fn(arcvcap_t r, void *d) assert(a == 0); while (1) { - //if (iter == ITERS) capmgr_hw_detach(HW_HPET_PERIODIC); //printc("I"); int p = sl_thd_rcv(RCV_ULONLY); //work_usecs(WORK_US); - printc("s"); rdtscll(st); chsnd(0); + if (iter == ITERS) capmgr_hw_detach(HW_HPET_PERIODIC); } sl_thd_exit(); } diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 777125322e..1e824c206d 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -42,7 +42,7 @@ #include #define SL_CS -#undef SL_REPLENISH +#define SL_REPLENISH #undef SL_PARENTCHILD /* Critical section (cs) API to protect scheduler data-structures */ diff --git a/src/components/lib/sl/sl_sched.c b/src/components/lib/sl/sl_sched.c index e91444b852..095dffb072 100644 --- a/src/components/lib/sl/sl_sched.c +++ b/src/components/lib/sl/sl_sched.c @@ -850,7 +850,8 @@ sl_thd_replenish_no_cs(struct sl_thd *t, cycles_t now) cycles_t replenish; int ret; - if (!(t->properties & SL_THD_PROPERTY_OWN_TCAP && t->budget)) return; + if (likely(!(t->properties & SL_THD_PROPERTY_OWN_TCAP))) return; + if (!t->budget) return; assert(t->period); assert(sl_thd_tcap(t) != sl__globals_core()->sched_tcap); diff --git a/src/kernel/capinv.c b/src/kernel/capinv.c index 1961c7e3b5..74def59e85 100644 --- a/src/kernel/capinv.c +++ b/src/kernel/capinv.c @@ -908,7 +908,6 @@ cap_hw_asnd(struct cap_asnd *asnd, struct 
pt_regs *regs) assert(rcv_tcap && tcap); next = asnd_process(rcv_thd, thd, rcv_tcap, tcap, &tcap_next, 0, cos_info); - assert(next == rcv_thd); if (next == thd) return 1; thd->state |= THD_STATE_PREEMPTED; diff --git a/src/platform/i386/runscripts/test_slite02.sh b/src/platform/i386/runscripts/test_slite02.sh index d01a307cbb..e51ba080f7 100644 --- a/src/platform/i386/runscripts/test_slite02.sh +++ b/src/platform/i386/runscripts/test_slite02.sh @@ -9,13 +9,13 @@ cp test_boot.o dummy1.o cp test_boot.o dummy2.o # only int and w0 in root -#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub +./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub #int, w0 in root and w1 in comp #./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o" ./gen_client_stub # int, w1 - w3 -./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" ./gen_client_stub +#./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, ;w1comp.o, ;w3comp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o;w1comp.o-boot.o|capmgr.o;w3comp.o-boot.o|capmgr.o" ./gen_client_stub #cp test_boot.o dummy.o #./cos_linker "llboot.o, ;dummy1.o, ;capmgr.o, ;dummy2.o, ;*boot.o, ;intcomp.o, :boot.o-capmgr.o;intcomp.o-boot.o|capmgr.o" ./gen_client_stub diff --git a/src/platform/i386/runscripts/unit_slite01.sh b/src/platform/i386/runscripts/unit_slite01.sh index 00ee72b414..8a887a8a36 100644 --- a/src/platform/i386/runscripts/unit_slite01.sh +++ b/src/platform/i386/runscripts/unit_slite01.sh @@ -3,6 +3,7 @@ cp llboot_comp.o llboot.o cp root_fprr.o boot.o #cp unit_slrcvtest.o boot.o +#cp test_boot.o dummy1.o #cp test_boot.o dummy2.o -./cos_linker "llboot.o, ;*unit_slrcvtest.o, ;capmgr.o, ;*spin_comp.o, ;*boot.o, :boot.o-capmgr.o;unit_slrcvtest.o-boot.o|capmgr.o;spin_comp.o-boot.o|capmgr.o" ./gen_client_stub +./cos_linker "llboot.o, ;*spin_comp.o, ;capmgr.o, ;*unit_slrcvtest.o, ;*boot.o, :boot.o-capmgr.o;unit_slrcvtest.o-boot.o|capmgr.o;spin_comp.o-boot.o|capmgr.o" ./gen_client_stub #./cos_linker "llboot.o, ;dummy2.o, ;capmgr.o, ;dummy1.o, ;*boot.o, :boot.o-capmgr.o" ./gen_client_stub From 529ae447be0ab64f06888419d6bf3f06bf044851 Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 18 Oct 2019 12:45:19 -0400 Subject: [PATCH 122/127] disable tcap check --- src/components/include/sl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/include/sl.h b/src/components/include/sl.h index 1e824c206d..777125322e 100644 --- a/src/components/include/sl.h +++ b/src/components/include/sl.h @@ -42,7 +42,7 @@ #include #define SL_CS -#define SL_REPLENISH +#undef SL_REPLENISH #undef SL_PARENTCHILD /* Critical section (cs) API to protect scheduler data-structures */ From 3ed8962aae2630ec46de78fe5798bc9aaf0dec2e Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 18 Oct 2019 13:06:17 -0400 Subject: [PATCH 123/127] iters change in channel test --- .../implementation/tests/micro_chan/unit_schedlib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/components/implementation/tests/micro_chan/unit_schedlib.c b/src/components/implementation/tests/micro_chan/unit_schedlib.c index 8c770acd53..bfc8c2340d 
100644 --- a/src/components/implementation/tests/micro_chan/unit_schedlib.c +++ b/src/components/implementation/tests/micro_chan/unit_schedlib.c @@ -18,7 +18,7 @@ #include /* Iterations, channels */ -#define CHAN_ITER 1000 +#define CHAN_ITER 1000000 #define NCHANTHDS 2 #define CHAN_BATCH 3 @@ -366,8 +366,8 @@ test_yields(void) //sl_thd_yield_thd(threads[i]); } assert(N_TESTTHDS == 2); - crt_chan_p2p_init_test(c0, threads[SND], threads[RCV]); - //crt_chan_init_test(c0); + //crt_chan_p2p_init_test(c0, threads[SND], threads[RCV]); + crt_chan_init_test(c0); } //void From 89a2ee29af24679a3a3ba8cdc398ae354aa11864 Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 1 Nov 2019 17:53:26 -0400 Subject: [PATCH 124/127] allow make clean all from src/ --- src/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Makefile b/src/Makefile index 8a7bd33810..47933ee302 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,9 +1,9 @@ MAKEFLAGS=--no-print-directory --section-alignment 0x1000 -I$(PWD) #$(info Make flags $(MAKEFLAGS)) -default: | all cp +default: | all -all: comps plat +all: comps plat cp comps: $(info ) From b9b9006a67b50d9aa80ae6393a91ceb079c32084 Mon Sep 17 00:00:00 2001 From: phani Date: Fri, 1 Nov 2019 17:53:58 -0400 Subject: [PATCH 125/127] part for NTHDS > 1 on single core and a test program --- .../implementation/tests/part_test/Makefile | 8 ++ .../implementation/tests/part_test/init.c | 79 +++++++++++++++++++ .../implementation/tests/part_test/main.c | 32 ++++++++ src/components/include/part_task.h | 3 +- src/platform/i386/runscripts/part_test.sh | 4 + 5 files changed, 125 insertions(+), 1 deletion(-) create mode 100644 src/components/implementation/tests/part_test/Makefile create mode 100644 src/components/implementation/tests/part_test/init.c create mode 100644 src/components/implementation/tests/part_test/main.c create mode 100644 src/platform/i386/runscripts/part_test.sh diff --git a/src/components/implementation/tests/part_test/Makefile b/src/components/implementation/tests/part_test/Makefile new file mode 100644 index 0000000000..3fcb066f74 --- /dev/null +++ b/src/components/implementation/tests/part_test/Makefile @@ -0,0 +1,8 @@ +COMPONENT=part_test.o +INTERFACES= +#DEPENDENCIES=capmgr +IF_LIB= +ADDITIONAL_LIBS=-lcobj_format -lcos_kernel_api $(LIBSLRAW) -lsl_mod_part_fifo -lsl_thd_static_backend -lsl_lock -lcos_defkernel_api -lpart_raw -lsl_blkpt -lps + +include ../../Makefile.subsubdir +MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/part_test/init.c b/src/components/implementation/tests/part_test/init.c new file mode 100644 index 0000000000..3511588c85 --- /dev/null +++ b/src/components/implementation/tests/part_test/init.c @@ -0,0 +1,79 @@ +#include +#include +#include +#include +#include +#include +#include + +int main(void); + +void +cos_exit(int x) +{ + PRINTC("Exit code: %d\n", x); + while (1) ; +} + +static void +cos_main(void *d) +{ + assert(sl_thd_thdid(sl_thd_curr()) == cos_thdid()); + main(); + + while (1) ; +} + +extern void cos_gomp_init(void); + +void +cos_init(void *d) +{ + struct cos_defcompinfo *defci = cos_defcompinfo_curr_get(); + struct cos_compinfo * ci = cos_compinfo_get(defci); + int i; + static volatile unsigned long first = NUM_CPU + 1, init_done[NUM_CPU] = { 0 }; + static unsigned b1 = 0, b2 = 0, b3 = 0; + + PRINTC("In a parallel program!\n"); + if (ps_cas(&first, NUM_CPU + 1, cos_cpuid())) { + cos_meminfo_init(&(ci->mi), BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + 
cos_defcompinfo_llinit(); + } else { + while (!ps_load(&init_done[first])) ; + + cos_defcompinfo_sched_init(); + } + cos_dcb_info_init_curr(); + ps_faa(&init_done[cos_cpuid()], 1); + + /* make sure the INITTHD of the scheduler is created on all cores.. for cross-core sl initialization to work! */ + for (i = 0; i < NUM_CPU; i++) { + while (!ps_load(&init_done[i])) ; + } + sl_init(SL_MIN_PERIOD_US*100); + /* barrier, wait for sl_init to be done on all cores */ + ps_faa(&b1, 1); + while (ps_load(&b1) != NUM_CPU) ; + part_init(); + /* barrier, wait for gomp_init to be done on all cores */ + ps_faa(&b2, 1); + while (ps_load(&b2) != NUM_CPU) ; + + if (!cos_cpuid()) { + struct sl_thd *t = NULL; + + t = sl_thd_alloc(cos_main, NULL); + assert(t); + sl_thd_param_set(t, sched_param_pack(SCHEDP_PRIO, TCAP_PRIO_MAX)); + } + /* wait for all cores to reach this point, so all threads wait for main thread to be ready! */ + ps_faa(&b3, 1); + while (ps_load(&b3) != NUM_CPU) ; + + sl_sched_loop_nonblock(); + + PRINTC("Should never get here!\n"); + assert(0); +} + diff --git a/src/components/implementation/tests/part_test/main.c b/src/components/implementation/tests/part_test/main.c new file mode 100644 index 0000000000..b751b97ece --- /dev/null +++ b/src/components/implementation/tests/part_test/main.c @@ -0,0 +1,32 @@ +#include +#include + +#define NTHDS 2 + +void +work_fn(void *d) +{ + PRINTC("Sharing work!\n"); +} + +int +main(void) +{ + struct sl_thd *c = sl_thd_curr(); + struct part_task *p = (struct part_task *)c->part_context, *pt = &main_task; + int n = NTHDS > PART_MAX_PAR_THDS ? PART_MAX_PAR_THDS : NTHDS; + + assert(p == NULL); + + pt->state = PART_TASK_S_ALLOCATED; + part_task_init(pt, PART_TASK_T_WORKSHARE, p, n, work_fn, NULL, NULL); + assert(pt->nthds = n); + + c->part_context = pt; + part_list_append(pt); + + work_fn(NULL); + part_task_end(pt); + + PRINTC("Done!\n"); +} diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index dff7b9e42d..5ec8535771 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -13,7 +13,7 @@ #define PART_MAX_TASKS (NUM_CPU < 4 ? 2048 : 8192) #define PART_MAX_DATA PART_MAX_TASKS -#define PART_MAX_PAR_THDS NUM_CPU +#define PART_MAX_PAR_THDS 4 // to test 4 data-parallel tasks on a single core #define PART_MAX_CORE_THDS (NUM_CPU == 1 ? 200 : (NUM_CPU == 2 ? 128 : (NUM_CPU < 5 ? 
64 : 48))) #define PART_MAX_THDS 512 #define PART_MAX_CHILD 1024 @@ -108,6 +108,7 @@ part_task_init(struct part_task *t, part_task_type_t type, struct part_task *p, memset(t->ws, 0, sizeof(struct part_workshare) * PART_MAX_WORKSHARES); t->cs.fn = fn; t->cs.data = data; + assert (nthds <= PART_MAX_PAR_THDS); t->nthds = nthds; t->nworkers = 0; memset(t->workers, 0, sizeof(unsigned) * PART_MAX_PAR_THDS); diff --git a/src/platform/i386/runscripts/part_test.sh b/src/platform/i386/runscripts/part_test.sh new file mode 100644 index 0000000000..a8815e0903 --- /dev/null +++ b/src/platform/i386/runscripts/part_test.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +cp part_test.o llboot.o +./cos_linker "llboot.o, :" ./gen_client_stub From 6d500f4d820999459dc33f161a20802f1cdadaa6 Mon Sep 17 00:00:00 2001 From: Phani Date: Thu, 21 May 2020 10:48:58 -0400 Subject: [PATCH 126/127] merge with my own branches first --- .../no_interface/omp_workconservation/work_problem.c | 2 +- src/components/implementation/sched/test_sched/init.c | 2 +- src/components/implementation/srv_dummy/Makefile | 2 +- src/components/implementation/srv_dummy/cdummy/Makefile | 2 +- src/components/implementation/srv_dummy/chan_backend.c | 1 + src/components/implementation/srv_dummy/sched_info.h | 2 ++ src/components/implementation/srv_dummy/sdummy/Makefile | 2 +- src/components/implementation/tests/micro_booter/mb_tests.c | 4 ++-- .../implementation/tests/test_schedinv/test_schedinv.c | 2 +- .../implementation/tests/unit_schedtests/unit_schedlib.c | 2 +- src/components/include/part_task.h | 2 +- src/components/lib/ps | 2 +- 12 files changed, 14 insertions(+), 11 deletions(-) create mode 120000 src/components/implementation/srv_dummy/chan_backend.c diff --git a/src/components/implementation/no_interface/omp_workconservation/work_problem.c b/src/components/implementation/no_interface/omp_workconservation/work_problem.c index b2cafd461c..e395df3eeb 100644 --- a/src/components/implementation/no_interface/omp_workconservation/work_problem.c +++ b/src/components/implementation/no_interface/omp_workconservation/work_problem.c @@ -78,7 +78,7 @@ int main(void) assert(diff > 0); total += diff; - if (diff > max) max = diff; + if ((unsigned long long) diff > max) max = diff; printc("%ld, %ld\n", diff, diff / CYC_US); } diff --git a/src/components/implementation/sched/test_sched/init.c b/src/components/implementation/sched/test_sched/init.c index 63ea842050..83db6ea806 100644 --- a/src/components/implementation/sched/test_sched/init.c +++ b/src/components/implementation/sched/test_sched/init.c @@ -271,7 +271,7 @@ cos_init(void) sl_init_corebmp(100*SL_MIN_PERIOD_US, cpubmp); vaddr_t tscaddr = 0; cbuf_t id = channel_shared_page_alloc(SHMCHANNEL_KEY, &tscaddr); - assert(id >= 0); + assert(id > 0); int_start = (cycles_t *)tscaddr; *int_start = 0ULL; rdy = (volatile unsigned long *)(int_start + 1); diff --git a/src/components/implementation/srv_dummy/Makefile b/src/components/implementation/srv_dummy/Makefile index 0490a703e3..53929a7ceb 100644 --- a/src/components/implementation/srv_dummy/Makefile +++ b/src/components/implementation/srv_dummy/Makefile @@ -1,3 +1,3 @@ -INTERFACES=sched schedinit srv_dummy +INTERFACES=sched schedinit crt srv_dummy include ../Makefile.subdir diff --git a/src/components/implementation/srv_dummy/cdummy/Makefile b/src/components/implementation/srv_dummy/cdummy/Makefile index 1762e85c90..f6165eca08 100644 --- a/src/components/implementation/srv_dummy/cdummy/Makefile +++ b/src/components/implementation/srv_dummy/cdummy/Makefile @@ -5,7 +5,7 
@@ INTERFACES=sched schedinit srv_dummy DEPENDENCIES=capmgr sched schedinit channel IF_LIB= FN_PREPEND=parent_ -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsinv_client +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsinv_client -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/srv_dummy/chan_backend.c b/src/components/implementation/srv_dummy/chan_backend.c new file mode 120000 index 0000000000..1f996d8e9b --- /dev/null +++ b/src/components/implementation/srv_dummy/chan_backend.c @@ -0,0 +1 @@ +../sched/chan_backend.c \ No newline at end of file diff --git a/src/components/implementation/srv_dummy/sched_info.h b/src/components/implementation/srv_dummy/sched_info.h index 7cb898ec51..b922a2464b 100644 --- a/src/components/implementation/srv_dummy/sched_info.h +++ b/src/components/implementation/srv_dummy/sched_info.h @@ -11,8 +11,10 @@ #include #include #include +#include #define SCHED_MAX_CHILD_COMPS 8 +CRT_CHAN_TYPE_PROTOTYPES(LU, CHAN_CRT_ITEM_TYPE, CHAN_CRT_NSLOTS); struct sched_childinfo { struct cos_defcompinfo defcinfo; diff --git a/src/components/implementation/srv_dummy/sdummy/Makefile b/src/components/implementation/srv_dummy/sdummy/Makefile index 75fff5cefa..89e1ccf634 100644 --- a/src/components/implementation/srv_dummy/sdummy/Makefile +++ b/src/components/implementation/srv_dummy/sdummy/Makefile @@ -5,7 +5,7 @@ INTERFACES=sched schedinit srv_dummy DEPENDENCIES=capmgr sched schedinit IF_LIB= FN_PREPEND=parent_ -ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr +ADDITIONAL_LIBS=$(LIBSLCAPMGR) -lsl_thd_static_backend -lsl_mod_fprr -lsl_blkpt include ../../Makefile.subsubdir MANDITORY_LIB=simple_stklib.o diff --git a/src/components/implementation/tests/micro_booter/mb_tests.c b/src/components/implementation/tests/micro_booter/mb_tests.c index 1eedd26023..744943c49d 100644 --- a/src/components/implementation/tests/micro_booter/mb_tests.c +++ b/src/components/implementation/tests/micro_booter/mb_tests.c @@ -67,7 +67,7 @@ test_thds_perf(void) diff = end_swt_cycles - start_swt_cycles; total_swt_cycles += diff; - if (diff > max) max = diff; + if (diff > (cycles_t)max) max = diff; } //total_swt_cycles = (end_swt_cycles - start_swt_cycles) / 2LL; @@ -185,7 +185,7 @@ async_thd_parent_perf(void *thdcap) rdtscll(end_arcv_cycles); assert(switched); diff = end_arcv_cycles - start_asnd_cycles; - if (diff > max) max = diff; + if (diff > (cycles_t)max) max = diff; total_asnd_cycles += diff; } //total_asnd_cycles = (end_arcv_cycles - start_asnd_cycles) / 2; diff --git a/src/components/implementation/tests/test_schedinv/test_schedinv.c b/src/components/implementation/tests/test_schedinv/test_schedinv.c index 5782539f3f..2e71cb8ef3 100644 --- a/src/components/implementation/tests/test_schedinv/test_schedinv.c +++ b/src/components/implementation/tests/test_schedinv/test_schedinv.c @@ -119,7 +119,7 @@ cos_init(void) vaddr_t addr = 0; unsigned long pages = 0; cbuf_t id = channel_shared_page_map(SHMCHANNEL_KEY, &addr, &pages); - assert(id >= 0 && addr && pages == 1); + assert(id > 0 && addr && pages == 1); sttsc = (cycles_t *)addr; rdy = (volatile unsigned long *)(sttsc + 1); diff --git a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c index 770493803e..807776f25c 100644 --- a/src/components/implementation/tests/unit_schedtests/unit_schedlib.c +++ 
b/src/components/implementation/tests/unit_schedtests/unit_schedlib.c @@ -95,7 +95,7 @@ test_inv_setup(void) ic = cos_sinv_alloc(ci, cc, (vaddr_t)__inv_test_serverfn, 0); assert(ic > 0); ret = call_cap_mb(ic, 1, 2, 3); - assert(ret == (int)MAGIC_RET); + assert(ret == MAGIC_RET); sinv_cap = ic; } diff --git a/src/components/include/part_task.h b/src/components/include/part_task.h index 5ec8535771..8bc9f4ea38 100644 --- a/src/components/include/part_task.h +++ b/src/components/include/part_task.h @@ -13,7 +13,7 @@ #define PART_MAX_TASKS (NUM_CPU < 4 ? 2048 : 8192) #define PART_MAX_DATA PART_MAX_TASKS -#define PART_MAX_PAR_THDS 4 // to test 4 data-parallel tasks on a single core +#define PART_MAX_PAR_THDS NUM_CPU /* change this to test more data-parallel tasks on single core configuration */ #define PART_MAX_CORE_THDS (NUM_CPU == 1 ? 200 : (NUM_CPU == 2 ? 128 : (NUM_CPU < 5 ? 64 : 48))) #define PART_MAX_THDS 512 #define PART_MAX_CHILD 1024 diff --git a/src/components/lib/ps b/src/components/lib/ps index 5749bd695f..33f7771466 160000 --- a/src/components/lib/ps +++ b/src/components/lib/ps @@ -1 +1 @@ -Subproject commit 5749bd695f8eb4529b879bd4d9cfdfe6add3bdd0 +Subproject commit 33f77714661eb553c4d5034ec68366859410d8a8 From bb1476b4d2464c6bb0572130beb526c01ea3d47f Mon Sep 17 00:00:00 2001 From: Phani Date: Fri, 22 May 2020 11:13:45 -0400 Subject: [PATCH 127/127] indentation in kernel_tests is off completely --- .../tests/kernel_tests/k_perf_tests.c | 298 ++++----- .../tests/kernel_tests/k_test_async.c | 244 ++++---- .../tests/kernel_tests/k_test_captbl.c | 28 +- .../tests/kernel_tests/k_test_inv.c | 120 ++-- .../tests/kernel_tests/k_test_mem.c | 80 +-- .../tests/kernel_tests/k_test_tcap.c | 582 +++++++++--------- .../tests/kernel_tests/k_test_thd.c | 186 +++--- .../tests/kernel_tests/kernel_test_booter.c | 98 +-- .../tests/kernel_tests/kernel_tests.h | 52 +- 9 files changed, 844 insertions(+), 844 deletions(-) diff --git a/src/components/implementation/tests/kernel_tests/k_perf_tests.c b/src/components/implementation/tests/kernel_tests/k_perf_tests.c index 065d55f661..81812d60c5 100644 --- a/src/components/implementation/tests/kernel_tests/k_perf_tests.c +++ b/src/components/implementation/tests/kernel_tests/k_perf_tests.c @@ -23,58 +23,58 @@ volatile cycles_t main_thd = 0, side_thd = 0; static void bounceback(void *d) { - while (1) { - rdtscll(side_thd); - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - } + while (1) { + rdtscll(side_thd); + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + } } static void test_thds_create_switch(void) { - thdcap_t ts; - int ret, i; + thdcap_t ts; + int ret, i; - perfdata_init(&pd[cos_cpuid()], "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, bounceback, NULL, 0, 0); - if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { - return; - } + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, bounceback, NULL, 0, 0); + if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { + return; + } - for (i = 0; i < ITER; i++) { - rdtscll(main_thd); - ret = cos_thd_switch(ts); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + for (i = 0; i < ITER; i++) { + rdtscll(main_thd); + ret = cos_thd_switch(ts); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); - } + perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); + } - perfdata_calc(&pd[cos_cpuid()]); 
+ perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tCOS THD => COS_THD_SWITCH:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tCOS THD => COS_THD_SWITCH:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); - perfdata_init(&pd[cos_cpuid()], "COS THD => COS_SWITCH", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "COS THD => COS_SWITCH", test_results, ARRAY_SIZE); - for (i = 0; i < ITER; i++) { - rdtscll(main_thd); - ret = cos_switch(ts, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, 0, 0, 0); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + for (i = 0; i < ITER; i++) { + rdtscll(main_thd); + ret = cos_switch(ts, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, 0, 0, 0); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); - } + perfdata_add(&pd[cos_cpuid()], (side_thd - main_thd)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tCOS THD => COS_SWITCH:\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tCOS THD => COS_SWITCH:\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), perfdata_99ptile(&pd[cos_cpuid()])); } /* @@ -86,162 +86,162 @@ test_thds_create_switch(void) static void async_thd_fn_perf(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - asndcap_t sc = scc_global[cos_cpuid()]; - arcvcap_t rc = rcc_global[cos_cpuid()]; - int i, ret, pending = 0; + thdcap_t tc = (thdcap_t)thdcap; + asndcap_t sc = scc_global[cos_cpuid()]; + arcvcap_t rc = rcc_global[cos_cpuid()]; + int i, ret, pending = 0; - for (i = 0; i < ITER; i++) { - cos_rcv(rc, 0); - cos_asnd(sc, 1); - } + for (i = 0; i < ITER; i++) { + cos_rcv(rc, 0); + cos_asnd(sc, 1); + } - cos_thd_switch(tc); + cos_thd_switch(tc); - for (i = 0; i < ITER + 1; i++) { - cos_rcv(rc, 0); - } + for (i = 0; i < ITER + 1; i++) { + cos_rcv(rc, 0); + } - ret = cos_thd_switch(tc); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + ret = cos_thd_switch(tc); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } static void async_thd_parent_perf(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - asndcap_t sc = scp_global[cos_cpuid()]; - arcvcap_t rc = 
rcc_global[cos_cpuid()]; - long long e = 0, s = 0; - int i, pending = 0; + thdcap_t tc = (thdcap_t)thdcap; + asndcap_t sc = scp_global[cos_cpuid()]; + arcvcap_t rc = rcc_global[cos_cpuid()]; + long long e = 0, s = 0; + int i, pending = 0; - perfdata_init(&pd[cos_cpuid()], "Async Endpoints => Roundtrip", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "Async Endpoints => Roundtrip", test_results, ARRAY_SIZE); - for (i = 0; i < ITER; i++) { - rdtscll(s); - cos_asnd(sc, 1); - cos_rcv(rc, 0); - rdtscll(e); + for (i = 0; i < ITER; i++) { + rdtscll(s); + cos_asnd(sc, 1); + cos_rcv(rc, 0); + rdtscll(e); - perfdata_add(&pd[cos_cpuid()], (e - s)); - } + perfdata_add(&pd[cos_cpuid()], (e - s)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tAsync Endpoints => Roundtrip:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), - perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tAsync Endpoints => Roundtrip:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), + perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), - perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), + perfdata_99ptile(&pd[cos_cpuid()])); - perfdata_init(&pd[cos_cpuid()], "Async Endpoints => One Way", test_results, ARRAY_SIZE); + perfdata_init(&pd[cos_cpuid()], "Async Endpoints => One Way", test_results, ARRAY_SIZE); - for (i = 0; i < ITER; i++) { - rdtscll(s); - cos_asnd(sc, 1); - rdtscll(e); + for (i = 0; i < ITER; i++) { + rdtscll(s); + cos_asnd(sc, 1); + rdtscll(e); - perfdata_add(&pd[cos_cpuid()], (e - s)); - } + perfdata_add(&pd[cos_cpuid()], (e - s)); + } - perfdata_calc(&pd[cos_cpuid()]); + perfdata_calc(&pd[cos_cpuid()]); - PRINTC("\tAsync Endpoints => One Way:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), - perfdata_sz(&pd[cos_cpuid()])); + PRINTC("\tAsync Endpoints => One Way:\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + perfdata_avg(&pd[cos_cpuid()]), perfdata_max(&pd[cos_cpuid()]), perfdata_min(&pd[cos_cpuid()]), + perfdata_sz(&pd[cos_cpuid()])); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), - perfdata_99ptile(&pd[cos_cpuid()])); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + perfdata_sd(&pd[cos_cpuid()]),perfdata_90ptile(&pd[cos_cpuid()]), perfdata_95ptile(&pd[cos_cpuid()]), + perfdata_99ptile(&pd[cos_cpuid()])); - async_test_flag_[cos_cpuid()] = 0; - while (1) cos_thd_switch(tc); + async_test_flag_[cos_cpuid()] = 0; + while (1) cos_thd_switch(tc); } static void test_async_endpoints_perf(void) { - thdcap_t tcp, tcc; - tcap_t tccp, tccc; - arcvcap_t rcp, rcc; - - /* parent rcv capabilities */ - tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent_perf, - (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 0, 0); - if(EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) return; - tccp = cos_tcap_alloc(&booter_info); - if(EXPECT_LL_LT(1, tccp, "Test 
Async Endpoints")) return; - rcp = cos_arcv_alloc(&booter_info, tcp, tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); - if(EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) return; - if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, - TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { - return; - } - - /* child rcv capabilities */ - tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn_perf, (void *)tcp, 0, 0); - if(EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) return; - tccc = cos_tcap_alloc(&booter_info); - if(EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) return; - rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); - if(EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) return; - if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, - TCAP_PRIO_MAX), "Test Async Endpoints")) - return; - - /* make the snd channel to the child */ - scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); - if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; - - /* make the snd channel to the parent */ - scc_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); - if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; - - rcc_global[cos_cpuid()] = rcc; - rcp_global[cos_cpuid()] = rcp; - - async_test_flag_[cos_cpuid()] = 1; - while (async_test_flag_[cos_cpuid()]) cos_thd_switch(tcp); + thdcap_t tcp, tcc; + tcap_t tccp, tccc; + arcvcap_t rcp, rcc; + + /* parent rcv capabilities */ + tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent_perf, + (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 0, 0); + if(EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) return; + tccp = cos_tcap_alloc(&booter_info); + if(EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) return; + rcp = cos_arcv_alloc(&booter_info, tcp, tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); + if(EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) return; + if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, + TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { + return; + } + + /* child rcv capabilities */ + tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn_perf, (void *)tcp, 0, 0); + if(EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) return; + tccc = cos_tcap_alloc(&booter_info); + if(EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) return; + rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); + if(EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) return; + if(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, + TCAP_PRIO_MAX), "Test Async Endpoints")) + return; + + /* make the snd channel to the child */ + scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); + if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; + + /* make the snd channel to the parent */ + scc_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); + if(EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; + + rcc_global[cos_cpuid()] = rcc; + rcp_global[cos_cpuid()] = rcp; + + async_test_flag_[cos_cpuid()] = 1; + while (async_test_flag_[cos_cpuid()]) cos_thd_switch(tcp); } void test_print_ubench(void) { - PRINTC("\tSINV:\t\t\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - result_sinv.avg, result_sinv.max, result_sinv.max, - result_sinv.sz); + 
PRINTC("\tSINV:\t\t\t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + result_sinv.avg, result_sinv.max, result_sinv.max, + result_sinv.sz); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - result_sinv.sd, result_sinv.p90tile, result_sinv.p95tile, - result_sinv.p99tile); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + result_sinv.sd, result_sinv.p90tile, result_sinv.p95tile, + result_sinv.p99tile); - PRINTC("\tTimer => Timeout Overhead: \t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - result_test_timer.avg, result_test_timer.max, result_test_timer.min, - result_test_timer.sz); + PRINTC("\tTimer => Timeout Overhead: \t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + result_test_timer.avg, result_test_timer.max, result_test_timer.min, + result_test_timer.sz); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - result_test_timer.sd, result_test_timer.p90tile, result_test_timer.p95tile, - result_test_timer.p99tile); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + result_test_timer.sd, result_test_timer.p90tile, result_test_timer.p95tile, + result_test_timer.p99tile); - PRINTC("\tTimer => Budget based: \t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", - result_budgets_single.avg, result_budgets_single.max, result_budgets_single.min, - result_budgets_single.sz); + PRINTC("\tTimer => Budget based: \t\t\tAVG:%llu, MAX:%llu, MIN:%llu, ITER:%d\n", + result_budgets_single.avg, result_budgets_single.max, result_budgets_single.min, + result_budgets_single.sz); - printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", - result_budgets_single.sd, result_budgets_single.p90tile, result_budgets_single.p95tile, - result_budgets_single.p99tile); + printc("\t\t\t\t\t\t\tSD:%llu, 90%%:%llu, 95%%:%llu, 99%%:%llu\n", + result_budgets_single.sd, result_budgets_single.p90tile, result_budgets_single.p95tile, + result_budgets_single.p99tile); } void test_run_perf_kernel(void) { - cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - test_thds_create_switch(); - test_async_endpoints_perf(); - test_print_ubench(); + cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + test_thds_create_switch(); + test_async_endpoints_perf(); + test_print_ubench(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_async.c b/src/components/implementation/tests/kernel_tests/k_test_async.c index f5ab4422bf..e32db4c61b 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_async.c +++ b/src/components/implementation/tests/kernel_tests/k_test_async.c @@ -17,136 +17,136 @@ static int failure = 0; static void async_thd_fn(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - arcvcap_t rc = rcc_global[cos_cpuid()]; - int pending, rcvd, ret; - - pending = cos_rcv(rc, RCV_NON_BLOCKING); - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, 0); - /* switch */ - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, 0); - /* switch */ - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, 0); - /* switch */ - if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, RCV_NON_BLOCKING); - if (EXPECT_LL_NEQ(pending, -EAGAIN, "Test Async Endpoints")) failure = 1; - - pending = cos_rcv(rc, 0); - /* switch */ - if (EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) failure = 1; - - ret = cos_thd_switch(tc); - if (EXPECT_LL_NEQ(0, ret, "COS Switch 
Error") || - EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) { - failure = 1; - } - while (1) cos_thd_switch(tc); + thdcap_t tc = (thdcap_t)thdcap; + arcvcap_t rc = rcc_global[cos_cpuid()]; + int pending, rcvd, ret; + + pending = cos_rcv(rc, RCV_NON_BLOCKING); + if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if (EXPECT_LL_NEQ(0, pending, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, RCV_NON_BLOCKING); + if (EXPECT_LL_NEQ(pending, -EAGAIN, "Test Async Endpoints")) failure = 1; + + pending = cos_rcv(rc, 0); + /* switch */ + if (EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) failure = 1; + + ret = cos_thd_switch(tc); + if (EXPECT_LL_NEQ(0, ret, "COS Switch Error") || + EXPECT_LL_NEQ(0, 1, "Test Async Endpoints")) { + failure = 1; + } + while (1) cos_thd_switch(tc); } static void async_thd_parent(void *thdcap) { - thdcap_t tc = (thdcap_t)thdcap; - arcvcap_t rc = rcp_global[cos_cpuid()]; - asndcap_t sc = scp_global[cos_cpuid()]; - int ret; - thdid_t tid; - int blocked; - cycles_t cycles, now; - tcap_time_t thd_timeout; - - /* NON_BLOCKING ASND with 0 as arg*/ - ret = cos_asnd(sc, 0); - ret = cos_asnd(sc, 0); - ret = cos_asnd(sc, 0); - ret = cos_asnd(sc, 1); - - /* switch */ - /* child blocked at this point, parent is using child's tcap, this call yields to the child */ - ret = cos_asnd(sc, 0); - - /* switch */ - ret = cos_asnd(sc, 0); - if (EXPECT_LL_NEQ(0, ret, "Test Async Endpoints")) failure = 1; - - /* switch */ - ret = cos_asnd(sc, 1); - if (EXPECT_LL_NEQ(0, ret, "Test Async Endpoints")) failure = 1; - - /* switch to parent */ - cos_sched_rcv(rc, 0, 0, &tid, &blocked, &cycles, &thd_timeout); - rdtscll(now); - - async_test_flag_[cos_cpuid()] = 0; - while (1) cos_thd_switch(tc); + thdcap_t tc = (thdcap_t)thdcap; + arcvcap_t rc = rcp_global[cos_cpuid()]; + asndcap_t sc = scp_global[cos_cpuid()]; + int ret; + thdid_t tid; + int blocked; + cycles_t cycles, now; + tcap_time_t thd_timeout; + + /* NON_BLOCKING ASND with 0 as arg*/ + ret = cos_asnd(sc, 0); + ret = cos_asnd(sc, 0); + ret = cos_asnd(sc, 0); + ret = cos_asnd(sc, 1); + + /* switch */ + /* child blocked at this point, parent is using child's tcap, this call yields to the child */ + ret = cos_asnd(sc, 0); + + /* switch */ + ret = cos_asnd(sc, 0); + if (EXPECT_LL_NEQ(0, ret, "Test Async Endpoints")) failure = 1; + + /* switch */ + ret = cos_asnd(sc, 1); + if (EXPECT_LL_NEQ(0, ret, "Test Async Endpoints")) failure = 1; + + /* switch to parent */ + cos_sched_rcv(rc, 0, 0, &tid, &blocked, &cycles, &thd_timeout); + rdtscll(now); + + async_test_flag_[cos_cpuid()] = 0; + while (1) cos_thd_switch(tc); } void test_async_endpoints(void) { - thdcap_t tcp, tcc; - tcap_t tccp, tccc; - arcvcap_t rcp, rcc; - asndcap_t scr; - - /* parent rcv capabilities */ - tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent, - (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 0, 0); - if (EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) { - return; - } - tccp = cos_tcap_alloc(&booter_info); - if (EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) { - return; - } - rcp = cos_arcv_alloc(&booter_info, tcp, tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); - if (EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) { - return; - } - if 
(EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, TCAP_PRIO_MAX), - "Test Async Endpoints")) { - return; - } - - /* child rcv capabilities */ - tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn, (void *)tcp, 0, 0); - if (EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) { - return; - } - tccc = cos_tcap_alloc(&booter_info); - if (EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) { - return; - } - rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); - if (EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) { - return; - } - if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, - TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { - return; - } - - /* make the snd channel to the child */ - scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); - if (EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; - scr = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); - if (EXPECT_LL_EQ(0, scr, "Test Async Endpoints")) return; - - rcc_global[cos_cpuid()] = rcc; - rcp_global[cos_cpuid()] = rcp; - - async_test_flag_[cos_cpuid()] = 1; - while (async_test_flag_[cos_cpuid()]) cos_asnd(scr, 1); - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\tSuccess\n", "Asynchronous Endpoints"); - EXIT_FN(); + thdcap_t tcp, tcc; + tcap_t tccp, tccc; + arcvcap_t rcp, rcc; + asndcap_t scr; + + /* parent rcv capabilities */ + tcp = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_parent, + (void *)BOOT_CAPTBL_SELF_INITTHD_CPU_BASE, 0, 0); + if (EXPECT_LL_LT(1, tcp, "Test Async Endpoints")) { + return; + } + tccp = cos_tcap_alloc(&booter_info); + if (EXPECT_LL_LT(1, tccp, "Test Async Endpoints")) { + return; + } + rcp = cos_arcv_alloc(&booter_info, tcp, tccp, booter_info.comp_cap, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE); + if (EXPECT_LL_LT(1, rcp, "Test Async Endpoints")) { + return; + } + if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcp, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, TCAP_PRIO_MAX), + "Test Async Endpoints")) { + return; + } + + /* child rcv capabilities */ + tcc = cos_thd_alloc(&booter_info, booter_info.comp_cap, async_thd_fn, (void *)tcp, 0, 0); + if (EXPECT_LL_LT(1, tcc, "Test Async Endpoints")) { + return; + } + tccc = cos_tcap_alloc(&booter_info); + if (EXPECT_LL_LT(1, tccc, "Test Async Endpoints")) { + return; + } + rcc = cos_arcv_alloc(&booter_info, tcc, tccc, booter_info.comp_cap, rcp); + if (EXPECT_LL_LT(1, rcc, "Test Async Endpoints")) { + return; + } + if (EXPECT_LL_NEQ(0,cos_tcap_transfer(rcc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, TCAP_RES_INF, + TCAP_PRIO_MAX + 1), "Test Async Endpoints")) { + return; + } + + /* make the snd channel to the child */ + scp_global[cos_cpuid()] = cos_asnd_alloc(&booter_info, rcc, booter_info.captbl_cap); + if (EXPECT_LL_EQ(0, scp_global[cos_cpuid()], "Test Async Endpoints")) return; + scr = cos_asnd_alloc(&booter_info, rcp, booter_info.captbl_cap); + if (EXPECT_LL_EQ(0, scr, "Test Async Endpoints")) return; + + rcc_global[cos_cpuid()] = rcc; + rcp_global[cos_cpuid()] = rcp; + + async_test_flag_[cos_cpuid()] = 1; + while (async_test_flag_[cos_cpuid()]) cos_asnd(scr, 1); + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\tSuccess\n", "Asynchronous Endpoints"); + EXIT_FN(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_captbl.c b/src/components/implementation/tests/kernel_tests/k_test_captbl.c index d1d3d18551..76532eeef0 100644 --- 
a/src/components/implementation/tests/kernel_tests/k_test_captbl.c +++ b/src/components/implementation/tests/kernel_tests/k_test_captbl.c @@ -14,20 +14,20 @@ extern void *__inv_test_serverfn(int a, int b, int c); void test_captbl_expands(void) { - int i; - compcap_t cc; + int i; + compcap_t cc; - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, (vaddr_t)NULL); - if (EXPECT_LL_LT(1, cc, "Capability Table Expansion")) { - return; - } - for (i = 0; i < CAPTBL_ITER; i++) { - sinvcap_t ic; + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, (vaddr_t)NULL); + if (EXPECT_LL_LT(1, cc, "Capability Table Expansion")) { + return; + } + for (i = 0; i < CAPTBL_ITER; i++) { + sinvcap_t ic; - ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); - if(EXPECT_LL_LT(1, ic, "Capability Table: Cannot Allocate")) { - return; - } - } - PRINTC("\t%s: \t\tSuccess\n", "Capability Table Expansion"); + ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); + if(EXPECT_LL_LT(1, ic, "Capability Table: Cannot Allocate")) { + return; + } + } + PRINTC("\t%s: \t\tSuccess\n", "Capability Table Expansion"); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_inv.c b/src/components/implementation/tests/kernel_tests/k_test_inv.c index 39c7e3e405..f6833c36ba 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_inv.c +++ b/src/components/implementation/tests/kernel_tests/k_test_inv.c @@ -16,7 +16,7 @@ static cycles_t test_results[ARRAY_SIZE] = { 0 }; int test_serverfn(int a, int b, int c) { - return 0xDEADBEEF; + return 0xDEADBEEF; } extern void *__inv_test_serverfn(int a, int b, int c); @@ -24,68 +24,68 @@ extern void *__inv_test_serverfn(int a, int b, int c); static inline int call_cap_mb(u32_t cap_no, int arg1, int arg2, int arg3) { - int ret; - - /* - * Which stack should we use for this invocation? Simple, use - * this stack, at the current sp. This is essentially a - * function call into another component, with odd calling - * conventions. - */ - cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; - - __asm__ __volatile__("pushl %%ebp\n\t" - "movl %%esp, %%ebp\n\t" - "movl %%esp, %%edx\n\t" - "movl $1f, %%ecx\n\t" - "sysenter\n\t" - "1:\n\t" - "popl %%ebp" - : "=a"(ret) - : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3) - : "memory", "cc", "ecx", "edx"); - - return ret; + int ret; + + /* + * Which stack should we use for this invocation? Simple, use + * this stack, at the current sp. This is essentially a + * function call into another component, with odd calling + * conventions. 
+ */ + cap_no = (cap_no + 1) << COS_CAPABILITY_OFFSET; + + __asm__ __volatile__("pushl %%ebp\n\t" + "movl %%esp, %%ebp\n\t" + "movl %%esp, %%edx\n\t" + "movl $1f, %%ecx\n\t" + "sysenter\n\t" + "1:\n\t" + "popl %%ebp" + : "=a"(ret) + : "a"(cap_no), "b"(arg1), "S"(arg2), "D"(arg3) + : "memory", "cc", "ecx", "edx"); + + return ret; } void test_inv(void) { - compcap_t cc; - sinvcap_t ic; - unsigned int r; - int i; - cycles_t start_cycles = 0LL, end_cycles = 0LL; - - perfdata_init(&result, "SINV", test_results, ARRAY_SIZE); - - cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, (vaddr_t)NULL); - if (EXPECT_LL_LT(1, cc, "Invocation: Cannot Allocate")) return; - ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); - if (EXPECT_LL_LT(1, ic, "Invocation: Cannot Allocate")) return; - - r = call_cap_mb(ic, 1, 2, 3); - if (EXPECT_LLU_NEQ(0xDEADBEEF, r, "Test Invocation")) return; - - for (i = 0; i < ITER; i++) { - rdtscll(start_cycles); - call_cap_mb(ic, 1, 2, 3); - rdtscll(end_cycles); - - perfdata_add(&result, end_cycles - start_cycles); - } - - perfdata_calc(&result); - result_sinv.avg = perfdata_avg(&result); - result_sinv.max = perfdata_avg(&result); - result_sinv.min = perfdata_avg(&result); - result_sinv.sz = perfdata_avg(&result); - result_sinv.sd = perfdata_avg(&result); - result_sinv.p90tile = perfdata_avg(&result); - result_sinv.p95tile = perfdata_avg(&result); - result_sinv.p99tile = perfdata_avg(&result); - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\tSuccess\n", "Synchronous Invocations"); - EXIT_FN(); + compcap_t cc; + sinvcap_t ic; + unsigned int r; + int i; + cycles_t start_cycles = 0LL, end_cycles = 0LL; + + perfdata_init(&result, "SINV", test_results, ARRAY_SIZE); + + cc = cos_comp_alloc(&booter_info, booter_info.captbl_cap, booter_info.pgtbl_cap, 0, (vaddr_t)NULL, (vaddr_t)NULL); + if (EXPECT_LL_LT(1, cc, "Invocation: Cannot Allocate")) return; + ic = cos_sinv_alloc(&booter_info, cc, (vaddr_t)__inv_test_serverfn, 0); + if (EXPECT_LL_LT(1, ic, "Invocation: Cannot Allocate")) return; + + r = call_cap_mb(ic, 1, 2, 3); + if (EXPECT_LLU_NEQ(0xDEADBEEF, r, "Test Invocation")) return; + + for (i = 0; i < ITER; i++) { + rdtscll(start_cycles); + call_cap_mb(ic, 1, 2, 3); + rdtscll(end_cycles); + + perfdata_add(&result, end_cycles - start_cycles); + } + + perfdata_calc(&result); + result_sinv.avg = perfdata_avg(&result); + result_sinv.max = perfdata_avg(&result); + result_sinv.min = perfdata_avg(&result); + result_sinv.sz = perfdata_avg(&result); + result_sinv.sd = perfdata_avg(&result); + result_sinv.p90tile = perfdata_avg(&result); + result_sinv.p95tile = perfdata_avg(&result); + result_sinv.p99tile = perfdata_avg(&result); + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\tSuccess\n", "Synchronous Invocations"); + EXIT_FN(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_mem.c b/src/components/implementation/tests/kernel_tests/k_test_mem.c index b10fa54e94..4da2919749 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_mem.c +++ b/src/components/implementation/tests/kernel_tests/k_test_mem.c @@ -13,48 +13,48 @@ void test_mem_alloc(void) { - char * p, *s, *t, *prev; - int i; - const char *chk = "SUCCESS"; - int fail_contiguous = 0; + char * p, *s, *t, *prev; + int i; + const char *chk = "SUCCESS"; + int fail_contiguous = 0; - p = cos_page_bump_alloc(&booter_info); - if (p == NULL) { - EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); - return; - } - PRINTC("\t%s: \t\t\tSuccess\n", "Memory 
=> Allocation"); - strcpy(p, chk); + p = cos_page_bump_alloc(&booter_info); + if (p == NULL) { + EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); + return; + } + PRINTC("\t%s: \t\t\tSuccess\n", "Memory => Allocation"); + strcpy(p, chk); - if (EXPECT_LL_NEQ(0, strcmp(chk, p), "Memory Test: Wrong STRCPY")) { - return; - } + if (EXPECT_LL_NEQ(0, strcmp(chk, p), "Memory Test: Wrong STRCPY")) { + return; + } - s = cos_page_bump_alloc(&booter_info); - assert(s); - prev = s; - for (i = 0; i < TEST_NPAGES; i++) { - t = cos_page_bump_alloc(&booter_info); - if (t == NULL){ - EXPECT_LL_EQ(0, 1, "Memory Test: Cannot Allocate"); - return; - } - if (t != prev + PAGE_SIZE) { - fail_contiguous = 1; - } - prev = t; - } - if (!fail_contiguous) { - memset(s, 0, TEST_NPAGES * PAGE_SIZE); - } else if (EXPECT_LL_EQ(i, TEST_NPAGES,"Memory Test: Cannot Allocate contiguous")) { - return; - } + s = cos_page_bump_alloc(&booter_info); + assert(s); + prev = s; + for (i = 0; i < TEST_NPAGES; i++) { + t = cos_page_bump_alloc(&booter_info); + if (t == NULL){ + EXPECT_LL_EQ(0, 1, "Memory Test: Cannot Allocate"); + return; + } + if (t != prev + PAGE_SIZE) { + fail_contiguous = 1; + } + prev = t; + } + if (!fail_contiguous) { + memset(s, 0, TEST_NPAGES * PAGE_SIZE); + } else if (EXPECT_LL_EQ(i, TEST_NPAGES,"Memory Test: Cannot Allocate contiguous")) { + return; + } - t = cos_page_bump_allocn(&booter_info, TEST_NPAGES * PAGE_SIZE); - if (t == NULL) { - EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); - return; - } - memset(t, 0, TEST_NPAGES * PAGE_SIZE); - PRINTC("\t%s: \t\t\tSuccess\n", "Memory => R & W"); + t = cos_page_bump_allocn(&booter_info, TEST_NPAGES * PAGE_SIZE); + if (t == NULL) { + EXPECT_LL_NEQ(0, 1, "Memory Test: Cannot Allocate"); + return; + } + memset(t, 0, TEST_NPAGES * PAGE_SIZE); + PRINTC("\t%s: \t\t\tSuccess\n", "Memory => R & W"); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_tcap.c b/src/components/implementation/tests/kernel_tests/k_test_tcap.c index 952174ba2d..6b1a311552 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_tcap.c +++ b/src/components/implementation/tests/kernel_tests/k_test_tcap.c @@ -20,7 +20,7 @@ static cycles_t test_results[ARRAY_SIZE] = { 0 }; static void spinner(void *d) { - while (1); + while (1); } void @@ -40,136 +40,136 @@ sched_events_clear(void) void test_timer(void) { - thdcap_t tc; - cycles_t c = 0, p = 0; - int i, ret; - cycles_t s, e; - cycles_t cycles, now, utime; - long long time, mask; - tcap_time_t timer, thd_timeout; - - tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL, 0, 0); - - perfdata_init(&result, "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); - - for (i = 0; i <= TEST_ITER; i++){ - rdtscll(now); - timer = tcap_cyc2time(now + GRANULARITY * cyc_per_usec); - cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, - cos_sched_sync()); - p = c; - rdtscll(c); - time = (c - now - (cycles_t)(GRANULARITY * cyc_per_usec)); - mask = (time >> (sizeof(long long) * CHAR_BIT - 1)); - utime = (time + mask) ^ mask; - - if (i > 0) { - perfdata_add(&result, utime); - - if (EXPECT_LLU_LT((long long unsigned)(c-now), (unsigned)(GRANULARITY * cyc_per_usec * MAX_THDS), - "Timer: Failure on MAX") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * cyc_per_usec * MIN_THDS), (long long unsigned)(c-now), - "Timer: failure on MIN")) { - return; - } - } - sched_events_clear(); - } - - perfdata_calc(&result); - result_test_timer.avg = perfdata_avg(&result); - 
result_test_timer.max = perfdata_avg(&result); - result_test_timer.min = perfdata_avg(&result); - result_test_timer.sz = perfdata_avg(&result); - result_test_timer.sd = perfdata_avg(&result); - result_test_timer.p90tile = perfdata_avg(&result); - result_test_timer.p95tile = perfdata_avg(&result); - result_test_timer.p99tile = perfdata_avg(&result); - - /* Timer in past */ - c = 0, p = 0; - - rdtscll(c); - timer = tcap_cyc2time(c - GRANULARITY * cyc_per_usec); - cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, - cos_sched_sync()); - p = c; - rdtscll(c); - - if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Past")) { - return; - } - - sched_events_clear(); - - /* Timer now */ - c = 0, p = 0; - - rdtscll(c); - timer = tcap_cyc2time(c); - cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, - cos_sched_sync()); - p = c; - rdtscll(c); - - if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Now")) { - return; - } + thdcap_t tc; + cycles_t c = 0, p = 0; + int i, ret; + cycles_t s, e; + cycles_t cycles, now, utime; + long long time, mask; + tcap_time_t timer, thd_timeout; + + tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, spinner, NULL, 0, 0); + + perfdata_init(&result, "COS THD => COS_THD_SWITCH", test_results, ARRAY_SIZE); + + for (i = 0; i <= TEST_ITER; i++){ + rdtscll(now); + timer = tcap_cyc2time(now + GRANULARITY * cyc_per_usec); + cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, + cos_sched_sync()); + p = c; + rdtscll(c); + time = (c - now - (cycles_t)(GRANULARITY * cyc_per_usec)); + mask = (time >> (sizeof(long long) * CHAR_BIT - 1)); + utime = (time + mask) ^ mask; + + if (i > 0) { + perfdata_add(&result, utime); + + if (EXPECT_LLU_LT((long long unsigned)(c-now), (unsigned)(GRANULARITY * cyc_per_usec * MAX_THDS), + "Timer: Failure on MAX") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * cyc_per_usec * MIN_THDS), (long long unsigned)(c-now), + "Timer: failure on MIN")) { + return; + } + } + sched_events_clear(); + } + + perfdata_calc(&result); + result_test_timer.avg = perfdata_avg(&result); + result_test_timer.max = perfdata_avg(&result); + result_test_timer.min = perfdata_avg(&result); + result_test_timer.sz = perfdata_avg(&result); + result_test_timer.sd = perfdata_avg(&result); + result_test_timer.p90tile = perfdata_avg(&result); + result_test_timer.p95tile = perfdata_avg(&result); + result_test_timer.p99tile = perfdata_avg(&result); + + /* Timer in past */ + c = 0, p = 0; + + rdtscll(c); + timer = tcap_cyc2time(c - GRANULARITY * cyc_per_usec); + cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, + cos_sched_sync()); + p = c; + rdtscll(c); + + if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Past")) { + return; + } + + sched_events_clear(); + + /* Timer now */ + c = 0, p = 0; + + rdtscll(c); + timer = tcap_cyc2time(c); + cos_switch(tc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, 0, timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, + cos_sched_sync()); + p = c; + rdtscll(c); + + if (EXPECT_LLU_LT((long long unsigned)(c-p), (unsigned)(GRANULARITY * cyc_per_usec), "Timer: Now")) { + return; + } struct cos_sched_event ev; cos_ul_sched_rcv(BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, 0, 0, &ev); cycles = ev.evt.elapsed_cycs; - EXPECT_LLU_LT((long long unsigned)cycles, (long long unsigned)(c-p), "Timer => 
Cycles time"); + EXPECT_LLU_LT((long long unsigned)cycles, (long long unsigned)(c-p), "Timer => Cycles time"); - sched_events_clear(); - PRINTC("\t%s: \t\t\tSuccess\n", "One-Shot Timeout"); + sched_events_clear(); + PRINTC("\t%s: \t\t\tSuccess\n", "One-Shot Timeout"); } struct exec_cluster { - thdcap_t tc; - arcvcap_t rc; - tcap_t tcc; - cycles_t cyc; - asndcap_t sc; /* send-cap to send to rc */ - tcap_prio_t prio; - int xseq; /* expected activation sequence number for this thread */ + thdcap_t tc; + arcvcap_t rc; + tcap_t tcc; + cycles_t cyc; + asndcap_t sc; /* send-cap to send to rc */ + tcap_prio_t prio; + int xseq; /* expected activation sequence number for this thread */ }; struct budget_test_data { - /* p=parent, c=child, g=grand-child */ - struct exec_cluster p, c, g; + /* p=parent, c=child, g=grand-child */ + struct exec_cluster p, c, g; } bt[NUM_CPU], mbt[NUM_CPU]; -static int + static int exec_cluster_alloc(struct exec_cluster *e, cos_thd_fn_t fn, void *d, arcvcap_t parentc) { - e->tcc = cos_tcap_alloc(&booter_info); - if (EXPECT_LL_LT(1, e->tcc, "Cluster Allocation: TCAP ALLOC")) return -1; - e->tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, fn, d, 0, 0); - if (EXPECT_LL_LT(1, e->tc, "Cluster Allocation: THD ALLOC")) return -1; - e->rc = cos_arcv_alloc(&booter_info, e->tc, e->tcc, booter_info.comp_cap, parentc); - if (EXPECT_LL_LT(1, e->rc, "Cluster Allocation: ARCV ALLOC")) return -1; - e->sc = cos_asnd_alloc(&booter_info, e->rc, booter_info.captbl_cap); - if (EXPECT_LL_LT(1, e->sc, "Cluster Allocation: ASND ALLOC")) return -1; - - e->cyc = 0; - - return 0; + e->tcc = cos_tcap_alloc(&booter_info); + if (EXPECT_LL_LT(1, e->tcc, "Cluster Allocation: TCAP ALLOC")) return -1; + e->tc = cos_thd_alloc(&booter_info, booter_info.comp_cap, fn, d, 0, 0); + if (EXPECT_LL_LT(1, e->tc, "Cluster Allocation: THD ALLOC")) return -1; + e->rc = cos_arcv_alloc(&booter_info, e->tc, e->tcc, booter_info.comp_cap, parentc); + if (EXPECT_LL_LT(1, e->rc, "Cluster Allocation: ARCV ALLOC")) return -1; + e->sc = cos_asnd_alloc(&booter_info, e->rc, booter_info.captbl_cap); + if (EXPECT_LL_LT(1, e->sc, "Cluster Allocation: ASND ALLOC")) return -1; + + e->cyc = 0; + + return 0; } static void parent(void *d) { - assert(0); + assert(0); } static void spinner_cyc(void *d) { - cycles_t *p = (cycles_t *)d; + cycles_t *p = (cycles_t *)d; - while (1) rdtscll(*p); + while (1) rdtscll(*p); } #define TIMER_TIME 100 @@ -177,70 +177,70 @@ spinner_cyc(void *d) void test_2timers(void) { - int ret; - cycles_t s, e, timer; - - if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "TCAP v. Timer: Cannot Allocate")) { - return; - } - if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, - bt[cos_cpuid()].p.rc), "TCAP v. Timer: Cannot Allocate")) { - return; - } - - /* Timer > TCAP */ - - ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - GRANULARITY * TIMER_TIME, TCAP_PRIO_MAX + 2); - if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer : TCAP Transfer")) { - return; - } - - rdtscll(s); - timer = tcap_cyc2time(s + GRANULARITY * cyc_per_usec); - if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, - timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { - EXPECT_LL_NEQ(0, 1, "TCAP v. Timer: COS Switch"); - return; - } - rdtscll(e); - - if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), - "TCAP v. 
Timer: Timer > TCAP") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), - "TCAP v. Timer: Interreupt Under")) { - return; - } - - sched_events_clear(); - - /* Timer < TCAP */ - - ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - GRANULARITY * cyc_per_usec, TCAP_PRIO_MAX + 2); - if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer: TCAP Transfer")) { - return; - } - - rdtscll(s); - timer = tcap_cyc2time(s + GRANULARITY * TIMER_TIME); - if (EXPECT_LL_NEQ(0, cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, timer, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync()), "TCAP v. TImer: COS Switch")) { - return; - } - - rdtscll(e); - - if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), - "TCAP v. Timer: Timer < TCAP") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), - "TCAP v. Timer: Interreupt Under")) { - return; - } - - sched_events_clear(); - PRINTC("\t%s: \t\tSuccess\n", "Timer => Timeout v. Budget"); + int ret; + cycles_t s, e, timer; + + if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "TCAP v. Timer: Cannot Allocate")) { + return; + } + if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, + bt[cos_cpuid()].p.rc), "TCAP v. Timer: Cannot Allocate")) { + return; + } + + /* Timer > TCAP */ + + ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + GRANULARITY * TIMER_TIME, TCAP_PRIO_MAX + 2); + if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer : TCAP Transfer")) { + return; + } + + rdtscll(s); + timer = tcap_cyc2time(s + GRANULARITY * cyc_per_usec); + if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, + timer, BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { + EXPECT_LL_NEQ(0, 1, "TCAP v. Timer: COS Switch"); + return; + } + rdtscll(e); + + if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), + "TCAP v. Timer: Timer > TCAP") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), + "TCAP v. Timer: Interreupt Under")) { + return; + } + + sched_events_clear(); + + /* Timer < TCAP */ + + ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + GRANULARITY * cyc_per_usec, TCAP_PRIO_MAX + 2); + if (EXPECT_LL_NEQ(0, ret, "TCAP v. Timer: TCAP Transfer")) { + return; + } + + rdtscll(s); + timer = tcap_cyc2time(s + GRANULARITY * TIMER_TIME); + if (EXPECT_LL_NEQ(0, cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, timer, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync()), "TCAP v. TImer: COS Switch")) { + return; + } + + rdtscll(e); + + if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * cyc_per_usec), + "TCAP v. Timer: Timer < TCAP") || + EXPECT_LLU_LT((unsigned)(GRANULARITY * TIMER_TIME), (long long unsigned)(e-s), + "TCAP v. Timer: Interreupt Under")) { + return; + } + + sched_events_clear(); + PRINTC("\t%s: \t\tSuccess\n", "Timer => Timeout v. 
Budget"); } #define BUDGET_TIME 100 @@ -248,64 +248,64 @@ test_2timers(void) static void test_tcap_budgets_single(void) { - int i; - cycles_t s = 0, e = 0; - cycles_t time, mask; - int ret; - - perfdata_init(&result, "Timer => Budget based", test_results, ARRAY_SIZE); - - if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Single Budget: Cannot Allocate") || - EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, - bt[cos_cpuid()].p.rc), "Single Budget: Cannot Allocate")) { - return; - } - for (i = 1; i <= TEST_ITER; i++) { - - ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - GRANULARITY * BUDGET_TIME, TCAP_PRIO_MAX + 2); - if (EXPECT_LL_NEQ(0, ret, "Single Budget: TCAP Transfer")) { - return; - } - - rdtscll(s); - if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())){ - EXPECT_LL_NEQ(0, 1, "Single Budget: COS Switch"); - return; - } - rdtscll(e); - - if (i > 1) { - /* Performant absolute value function instead of branching */ - time = (e - s - (GRANULARITY * BUDGET_TIME)); - mask = (time >> (sizeof(cycles_t) * CHAR_BIT - 1)); - time = (time + mask) ^ mask; - - perfdata_add(&result, time); - - if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * BUDGET_TIME * MAX_THDS), - "Single Budget: MAX Bound") || - EXPECT_LLU_LT((unsigned)(GRANULARITY * BUDGET_TIME * MIN_THDS), (long long unsigned)(e-s), - "Single Budget: MIN Bound")) { - return; - } - } - sched_events_clear(); - } - - perfdata_calc(&result); - result_budgets_single.avg = perfdata_avg(&result); - result_budgets_single.max = perfdata_avg(&result); - result_budgets_single.min = perfdata_avg(&result); - result_budgets_single.sz = perfdata_avg(&result); - result_budgets_single.sd = perfdata_avg(&result); - result_budgets_single.p90tile = perfdata_avg(&result); - result_budgets_single.p95tile = perfdata_avg(&result); - result_budgets_single.p99tile = perfdata_avg(&result); - - PRINTC("\t%s: \t\t\tSuccess\n", "Timer => Budget based"); + int i; + cycles_t s = 0, e = 0; + cycles_t time, mask; + int ret; + + perfdata_init(&result, "Timer => Budget based", test_results, ARRAY_SIZE); + + if (EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].p, parent, &bt[cos_cpuid()].p, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Single Budget: Cannot Allocate") || + EXPECT_LL_NEQ(0, exec_cluster_alloc(&bt[cos_cpuid()].c, spinner, &bt[cos_cpuid()].c, + bt[cos_cpuid()].p.rc), "Single Budget: Cannot Allocate")) { + return; + } + for (i = 1; i <= TEST_ITER; i++) { + + ret = cos_tcap_transfer(bt[cos_cpuid()].c.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + GRANULARITY * BUDGET_TIME, TCAP_PRIO_MAX + 2); + if (EXPECT_LL_NEQ(0, ret, "Single Budget: TCAP Transfer")) { + return; + } + + rdtscll(s); + if (cos_switch(bt[cos_cpuid()].c.tc, bt[cos_cpuid()].c.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())){ + EXPECT_LL_NEQ(0, 1, "Single Budget: COS Switch"); + return; + } + rdtscll(e); + + if (i > 1) { + /* Performant absolute value function instead of branching */ + time = (e - s - (GRANULARITY * BUDGET_TIME)); + mask = (time >> (sizeof(cycles_t) * CHAR_BIT - 1)); + time = (time + mask) ^ mask; + + perfdata_add(&result, time); + + if (EXPECT_LLU_LT((long long unsigned)(e-s), (unsigned)(GRANULARITY * BUDGET_TIME * MAX_THDS), + "Single Budget: MAX Bound") || + 
EXPECT_LLU_LT((unsigned)(GRANULARITY * BUDGET_TIME * MIN_THDS), (long long unsigned)(e-s), + "Single Budget: MIN Bound")) { + return; + } + } + sched_events_clear(); + } + + perfdata_calc(&result); + result_budgets_single.avg = perfdata_avg(&result); + result_budgets_single.max = perfdata_avg(&result); + result_budgets_single.min = perfdata_avg(&result); + result_budgets_single.sz = perfdata_avg(&result); + result_budgets_single.sd = perfdata_avg(&result); + result_budgets_single.p90tile = perfdata_avg(&result); + result_budgets_single.p95tile = perfdata_avg(&result); + result_budgets_single.p99tile = perfdata_avg(&result); + + PRINTC("\t%s: \t\t\tSuccess\n", "Timer => Budget based"); } #define RATE_1 1600 @@ -314,78 +314,78 @@ test_tcap_budgets_single(void) static void test_tcap_budgets_multi(void) { - int i; - - if(EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].p, spinner_cyc, &(mbt[cos_cpuid()].p.cyc), - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Multi Budget: Cannot Allocate") || - EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].c, spinner_cyc, &(mbt[cos_cpuid()].c.cyc), - mbt[cos_cpuid()].p.rc), "Multi Budget: Cannot Allocate") || - EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].g, spinner_cyc, &(mbt[cos_cpuid()].g.cyc), - mbt[cos_cpuid()].c.rc), "Multi Budget: Cannot allocate")) { - return; - } - - for (i = 1; i <= TEST_ITER; i++) { - tcap_res_t res; - cycles_t s, e; - tcap_time_t thd_timeout; - - /* test both increasing budgets and constant budgets */ - if (i > (TEST_ITER/2)) - res = GRANULARITY * RATE_1; - else - res = i * GRANULARITY * RATE_2; - - if (EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].p.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, - res, TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || - EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].c.rc, mbt[cos_cpuid()].p.tcc, res / 2, - TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || - EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].g.rc, mbt[cos_cpuid()].c.tcc, res / 4, - TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer")) { - return; - } - - mbt[cos_cpuid()].p.cyc = mbt[cos_cpuid()].c.cyc = mbt[cos_cpuid()].g.cyc = 0; - rdtscll(s); - if (cos_switch(mbt[cos_cpuid()].g.tc, mbt[cos_cpuid()].g.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, - BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { - EXPECT_LL_NEQ(0, 1, "Multi Budget: COS Switch"); - return; - } - rdtscll(e); + int i; + + if(EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].p, spinner_cyc, &(mbt[cos_cpuid()].p.cyc), + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE), "Multi Budget: Cannot Allocate") || + EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].c, spinner_cyc, &(mbt[cos_cpuid()].c.cyc), + mbt[cos_cpuid()].p.rc), "Multi Budget: Cannot Allocate") || + EXPECT_LL_NEQ(0, exec_cluster_alloc(&mbt[cos_cpuid()].g, spinner_cyc, &(mbt[cos_cpuid()].g.cyc), + mbt[cos_cpuid()].c.rc), "Multi Budget: Cannot allocate")) { + return; + } + + for (i = 1; i <= TEST_ITER; i++) { + tcap_res_t res; + cycles_t s, e; + tcap_time_t thd_timeout; + + /* test both increasing budgets and constant budgets */ + if (i > (TEST_ITER/2)) + res = GRANULARITY * RATE_1; + else + res = i * GRANULARITY * RATE_2; + + if (EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].p.rc, BOOT_CAPTBL_SELF_INITTCAP_CPU_BASE, + res, TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || + EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].c.rc, mbt[cos_cpuid()].p.tcc, res / 2, + TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer") || + EXPECT_LL_NEQ(0, cos_tcap_transfer(mbt[cos_cpuid()].g.rc, 
mbt[cos_cpuid()].c.tcc, res / 4, + TCAP_PRIO_MAX + 2), "Multi Budget: TCAP Transfer")) { + return; + } + + mbt[cos_cpuid()].p.cyc = mbt[cos_cpuid()].c.cyc = mbt[cos_cpuid()].g.cyc = 0; + rdtscll(s); + if (cos_switch(mbt[cos_cpuid()].g.tc, mbt[cos_cpuid()].g.tcc, TCAP_PRIO_MAX + 2, TCAP_TIME_NIL, + BOOT_CAPTBL_SELF_INITRCV_CPU_BASE, cos_sched_sync())) { + EXPECT_LL_NEQ(0, 1, "Multi Budget: COS Switch"); + return; + } + rdtscll(e); sched_events_clear(); - if ( i > 1) { - - /* To measure time of execution, we need a min time - * as well as a max time to determine - * if the interrupt happened when it was supposed to - * thus MAX bound and MIN bound - * MAX_THDS and MIN_THDS are #defined to give it some flexibility - * from the user - */ - - if (EXPECT_LLU_LT((mbt[cos_cpuid()].g.cyc - s), (res / 4 * MAX_THDS), "Multi Budget: G") || - EXPECT_LLU_LT(mbt[cos_cpuid()].g.cyc - s, res / 4 * MAX_THDS, "Multi Budget: G MAX Bound") || - EXPECT_LLU_LT(res / 4 * MIN_THDS, mbt[cos_cpuid()].g.cyc - s, "Multi Budget: G MIN Bound") || - EXPECT_LLU_LT(mbt[cos_cpuid()].c.cyc - s, res / 2 * MAX_THDS, "Multi Budget: C MAX Bound") || - EXPECT_LLU_LT(res / 2 * MIN_THDS, mbt[cos_cpuid()].c.cyc - s, "Multi Budget: C MIN Bound") || - EXPECT_LLU_LT(mbt[cos_cpuid()].p.cyc - s, res * MAX_THDS, "Multi Budget: P MAX Bound") || - EXPECT_LLU_LT(res * MIN_THDS, mbt[cos_cpuid()].p.cyc - s, "Multi Budget: P MIN BOund")) { - return; - } - } - } - PRINTC("\t%s: \t\tSuccess\n", "Timer => Hierarchical Budget"); + if ( i > 1) { + + /* To measure time of execution, we need a min time + * as well as a max time to determine + * if the interrupt happened when it was supposed to + * thus MAX bound and MIN bound + * MAX_THDS and MIN_THDS are #defined to give it some flexibility + * from the user + */ + + if (EXPECT_LLU_LT((mbt[cos_cpuid()].g.cyc - s), (res / 4 * MAX_THDS), "Multi Budget: G") || + EXPECT_LLU_LT(mbt[cos_cpuid()].g.cyc - s, res / 4 * MAX_THDS, "Multi Budget: G MAX Bound") || + EXPECT_LLU_LT(res / 4 * MIN_THDS, mbt[cos_cpuid()].g.cyc - s, "Multi Budget: G MIN Bound") || + EXPECT_LLU_LT(mbt[cos_cpuid()].c.cyc - s, res / 2 * MAX_THDS, "Multi Budget: C MAX Bound") || + EXPECT_LLU_LT(res / 2 * MIN_THDS, mbt[cos_cpuid()].c.cyc - s, "Multi Budget: C MIN Bound") || + EXPECT_LLU_LT(mbt[cos_cpuid()].p.cyc - s, res * MAX_THDS, "Multi Budget: P MAX Bound") || + EXPECT_LLU_LT(res * MIN_THDS, mbt[cos_cpuid()].p.cyc - s, "Multi Budget: P MIN BOund")) { + return; + } + } + } + PRINTC("\t%s: \t\tSuccess\n", "Timer => Hierarchical Budget"); } void test_tcap_budgets(void) { - /* single-level budgets test */ - test_tcap_budgets_single(); + /* single-level budgets test */ + test_tcap_budgets_single(); - /* multi-level budgets test */ - test_tcap_budgets_multi(); + /* multi-level budgets test */ + test_tcap_budgets_multi(); } diff --git a/src/components/implementation/tests/kernel_tests/k_test_thd.c b/src/components/implementation/tests/kernel_tests/k_test_thd.c index c5f1185c8d..a4cffad5c7 100644 --- a/src/components/implementation/tests/kernel_tests/k_test_thd.c +++ b/src/components/implementation/tests/kernel_tests/k_test_thd.c @@ -11,50 +11,50 @@ static int failure = 0; static void test_thd_arg(void *d) { - int ret = 0; + int ret = 0; - if (EXPECT_LL_NEQ((int)d, THD_ARG, "Thread Creation: Argument Incorrect")) failure = 1; - while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - PRINTC("Error, shouldn't get here!\n"); + if (EXPECT_LL_NEQ((int)d, THD_ARG, "Thread Creation: Argument Incorrect")) failure = 1; + while (1) 
cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + PRINTC("Error, shouldn't get here!\n"); } static void test_thds_create_switch(void) { - thdcap_t ts; - intptr_t i = THD_ARG; - int ret; - - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_thd_arg, (void *)i, 0, 0); - if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { - return; - } - ret = cos_thd_switch(ts); - EXPECT_LL_NEQ(0, ret, "COS Switch Error"); - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & ARG"); - EXIT_FN(); + thdcap_t ts; + intptr_t i = THD_ARG; + int ret; + + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, test_thd_arg, (void *)i, 0, 0); + if (EXPECT_LL_LT(1, ts, "Thread Creation: Cannot Allocate")) { + return; + } + ret = cos_thd_switch(ts); + EXPECT_LL_NEQ(0, ret, "COS Switch Error"); + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & ARG"); + EXIT_FN(); } static void thd_fn_mthds_ring(void *d) { - int ret; + int ret; - if (count != (int) d) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + if (count != (int) d) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - int next = (++count) % TEST_NTHDS; - if (!next) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + int next = (++count) % TEST_NTHDS; + if (!next) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - ret = cos_thd_switch(thd_test[next]); - if (EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error")) failure = 1; + ret = cos_thd_switch(thd_test[next]); + if (EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error")) failure = 1; - while (1) { - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - } - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + while (1) { + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + } + EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } /* Ring Multithreaded Test @@ -66,39 +66,39 @@ thd_fn_mthds_ring(void *d) static void test_mthds_ring(void) { - int i, ret; + int i, ret; - count = 0; + count = 0; - for (i = 0; i < TEST_NTHDS; i++) { - thd_test[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_ring, (void *)i, 0, 0); - if (EXPECT_LL_LT(1, thd_test[i], "Thread Ring: Cannot Allocate")) { - return; - } - } + for (i = 0; i < TEST_NTHDS; i++) { + thd_test[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_ring, (void *)i, 0, 0); + if (EXPECT_LL_LT(1, thd_test[i], "Thread Ring: Cannot Allocate")) { + return; + } + } - ret = cos_thd_switch(thd_test[0]); - EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error"); + ret = cos_thd_switch(thd_test[0]); + EXPECT_LL_NEQ(0, ret, "Thread Ring: COS Switch Error"); - if (EXPECT_LL_NEQ(count, TEST_NTHDS, "Thread Ring: Failure # of THDS")) { - return; - } + if (EXPECT_LL_NEQ(count, TEST_NTHDS, "Thread Ring: Failure # of THDS")) { + return; + } - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\t\tSuccess\n", "THD => Switch Cyclic" ); - EXIT_FN(); + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\t\tSuccess\n", "THD => Switch Cyclic" ); + EXIT_FN(); } static void thd_fn_mthds_classic(void *d) { - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - while (1) { - cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - } - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + while (1) { + cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + } + EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } /* Classic Multithreaded Test @@ -109,31 +109,31 @@ thd_fn_mthds_classic(void *d) static 
void test_mthds_classic(void) { - thdcap_t ts; - int i, ret; - - ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_classic, NULL, 0, 0); - if (EXPECT_LL_LT(1, ts, "Thread Classic: Cannot Allocate")) { - return; - } - - for (i = 0; i < ITER; i++) { - ret = cos_thd_switch(ts); - if(EXPECT_LL_NEQ(0, ret, "Thread Classic: COS Switch Error")) return; - } - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\tSuccess\n", "THD => Switch in pairs"); - EXIT_FN(); + thdcap_t ts; + int i, ret; + + ts = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_fn_mthds_classic, NULL, 0, 0); + if (EXPECT_LL_LT(1, ts, "Thread Classic: Cannot Allocate")) { + return; + } + + for (i = 0; i < ITER; i++) { + ret = cos_thd_switch(ts); + if(EXPECT_LL_NEQ(0, ret, "Thread Classic: COS Switch Error")) return; + } + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\tSuccess\n", "THD => Switch in pairs"); + EXIT_FN(); } static void thd_tls(void *d) { - if (EXPECT_LLU_NEQ((long unsigned)tls_get(0), (long unsigned)tls_test[cos_cpuid()][(int)d], - "Thread TLS: ARG not correct")) failure = 1; - while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); - EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); - assert(0); + if (EXPECT_LLU_NEQ((long unsigned)tls_get(0), (long unsigned)tls_test[cos_cpuid()][(int)d], + "Thread TLS: ARG not correct")) failure = 1; + while (1) cos_thd_switch(BOOT_CAPTBL_SELF_INITTHD_CPU_BASE); + EXPECT_LL_NEQ(1, 0, "Error, shouldn't get here!\n"); + assert(0); } /* Test the TLS support @@ -142,32 +142,32 @@ thd_tls(void *d) static void test_thds_tls(void) { - thdcap_t ts[TEST_NTHDS]; - intptr_t i; - int ret; - - for (i = 0; i < TEST_NTHDS; i++) { - ts[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_tls, (void *)i, 0, 0); - if (EXPECT_LL_LT(1, ts[i], "Thread TLS: Cannot Allocate")) { - return; - } - tls_test[cos_cpuid()][i] = i; - cos_thd_mod(&booter_info, ts[i], &tls_test[cos_cpuid()][i]); - ret = cos_thd_switch(ts[i]); - if (EXPECT_LL_NEQ(0, ret, "Thread TLS: COS Switch Error")) return; - } - - CHECK_STATUS_FLAG(); - PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & TLS"); - EXIT_FN(); + thdcap_t ts[TEST_NTHDS]; + intptr_t i; + int ret; + + for (i = 0; i < TEST_NTHDS; i++) { + ts[i] = cos_thd_alloc(&booter_info, booter_info.comp_cap, thd_tls, (void *)i, 0, 0); + if (EXPECT_LL_LT(1, ts[i], "Thread TLS: Cannot Allocate")) { + return; + } + tls_test[cos_cpuid()][i] = i; + cos_thd_mod(&booter_info, ts[i], &tls_test[cos_cpuid()][i]); + ret = cos_thd_switch(ts[i]); + if (EXPECT_LL_NEQ(0, ret, "Thread TLS: COS Switch Error")) return; + } + + CHECK_STATUS_FLAG(); + PRINTC("\t%s: \t\t\tSuccess\n", "THD => Creation & TLS"); + EXIT_FN(); } void test_thds(void) { - test_thds_create_switch(); - test_thds_tls(); - test_mthds_classic(); - test_mthds_ring(); + test_thds_create_switch(); + test_thds_tls(); + test_mthds_classic(); + test_mthds_ring(); } diff --git a/src/components/implementation/tests/kernel_tests/kernel_test_booter.c b/src/components/implementation/tests/kernel_tests/kernel_test_booter.c index 158d9b6dcf..50ebeb8d59 100644 --- a/src/components/implementation/tests/kernel_tests/kernel_test_booter.c +++ b/src/components/implementation/tests/kernel_tests/kernel_test_booter.c @@ -14,7 +14,7 @@ int count = 0; void term_fn(void *d) { - SPIN(); + SPIN(); } static int test_done[NUM_CPU]; @@ -22,52 +22,52 @@ static int test_done[NUM_CPU]; void cos_init(void) { - int cycs, i; - static int first_init = 1, init_done = 0; - - cycs = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - 
printc("\t%d cycles per microsecond\n", cycs); - - if (first_init) { - first_init = 0; - cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); - cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, - (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE, &booter_info); - init_done = 1; - } - - while (!init_done); - - termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL, 0, 0); - assert(termthd[cos_cpuid()]); - PRINTC("Kernel Tests\n"); - printc("\nUnit Test Started:\n\n"); - - /* Kernel Tests */ - cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); - test_timer(); - test_tcap_budgets(); - test_2timers(); - test_thds(); - test_mem_alloc(); - test_async_endpoints(); - test_inv(); - test_captbl_expands(); - - printc("\nuBenchamarks Started:\n\n"); - - test_run_perf_kernel(); - - /* NOTE: This is just to make sense of the output on HW! To understand that microbooter runs to completion on all cores! */ - test_done[cos_cpuid()] = 1; - for (i = 0; i < NUM_CPU; i++) { - while (!test_done[i]) ; - } - - printc("\n"); - PRINTC("Kernel Tests done.\n"); - - cos_thd_switch(termthd[cos_cpuid()]); - - return; + int cycs, i; + static int first_init = 1, init_done = 0; + + cycs = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + printc("\t%d cycles per microsecond\n", cycs); + + if (first_init) { + first_init = 0; + cos_meminfo_init(&booter_info.mi, BOOT_MEM_KM_BASE, COS_MEM_KERN_PA_SZ, BOOT_CAPTBL_SELF_UNTYPED_PT); + cos_compinfo_init(&booter_info, BOOT_CAPTBL_SELF_PT, BOOT_CAPTBL_SELF_CT, BOOT_CAPTBL_SELF_COMP, + (vaddr_t)cos_get_heap_ptr(), LLBOOT_CAPTBL_FREE, &booter_info); + init_done = 1; + } + + while (!init_done); + + termthd[cos_cpuid()] = cos_thd_alloc(&booter_info, booter_info.comp_cap, term_fn, NULL, 0, 0); + assert(termthd[cos_cpuid()]); + PRINTC("Kernel Tests\n"); + printc("\nUnit Test Started:\n\n"); + + /* Kernel Tests */ + cyc_per_usec = cos_hw_cycles_per_usec(BOOT_CAPTBL_SELF_INITHW_BASE); + test_timer(); + test_tcap_budgets(); + test_2timers(); + test_thds(); + test_mem_alloc(); + test_async_endpoints(); + test_inv(); + test_captbl_expands(); + + printc("\nuBenchamarks Started:\n\n"); + + test_run_perf_kernel(); + + /* NOTE: This is just to make sense of the output on HW! To understand that microbooter runs to completion on all cores! 
*/ + test_done[cos_cpuid()] = 1; + for (i = 0; i < NUM_CPU; i++) { + while (!test_done[i]) ; + } + + printc("\n"); + PRINTC("Kernel Tests done.\n"); + + cos_thd_switch(termthd[cos_cpuid()]); + + return; } diff --git a/src/components/implementation/tests/kernel_tests/kernel_tests.h b/src/components/implementation/tests/kernel_tests/kernel_tests.h index 82741bef12..4668e89297 100644 --- a/src/components/implementation/tests/kernel_tests/kernel_tests.h +++ b/src/components/implementation/tests/kernel_tests/kernel_tests.h @@ -10,22 +10,22 @@ #undef assert /* On assert, immediately switch to the "exit" thread */ #define assert(node) \ - do { \ - if (unlikely(!(node))) { \ - debug_print("assert error in @ "); \ - cos_thd_switch(termthd[cos_cpuid()]); \ - } \ - } while (0) + do { \ + if (unlikely(!(node))) { \ + debug_print("assert error in @ "); \ + cos_thd_switch(termthd[cos_cpuid()]); \ + } \ + } while (0) #define EXIT_FN() \ - exit_fn: return; +exit_fn: return; #define CHECK_STATUS_FLAG() \ - do { \ - if (failure) { \ - goto exit_fn; \ - } \ - } while (0) + do { \ + if (failure) { \ + goto exit_fn; \ + } \ + } while (0) #include #include @@ -56,30 +56,30 @@ extern unsigned long thd_test[TEST_NTHDS]; extern int num, den, count; struct results { - long long unsigned avg; - long long unsigned max; - long long unsigned min; - long long unsigned sd; - int sz; - long long unsigned p90tile; - long long unsigned p95tile; - long long unsigned p99tile; + long long unsigned avg; + long long unsigned max; + long long unsigned min; + long long unsigned sd; + int sz; + long long unsigned p90tile; + long long unsigned p95tile; + long long unsigned p99tile; }; -static unsigned long + static unsigned long tls_get(size_t off) { - unsigned long val; + unsigned long val; - __asm__ __volatile__("movl %%gs:(%1), %0" : "=r"(val) : "r"(off) :); + __asm__ __volatile__("movl %%gs:(%1), %0" : "=r"(val) : "r"(off) :); - return val; + return val; } -static void + static void tls_set(size_t off, unsigned long val) { - __asm__ __volatile__("movl %0, %%gs:(%1)" : : "r"(val), "r"(off) : "memory"); + __asm__ __volatile__("movl %0, %%gs:(%1)" : : "r"(val), "r"(off) : "memory"); } extern void test_run_perf_kernel(void);
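
The timer and budget tests in this patch take |elapsed - expected| with the shift-and-xor idiom (mask = time >> (bits - 1); (time + mask) ^ mask) rather than a branch. Below is a minimal standalone sketch of that idiom, assuming two's-complement signed arithmetic and an arithmetic right shift on signed types (as on the i386 target); the branchless_abs name and the small driver are illustrative only and are not part of the patch.

#include <limits.h>	/* CHAR_BIT */
#include <stdio.h>

/* Branchless |v|: mask is all-ones when v is negative, all-zeros otherwise. */
static long long
branchless_abs(long long v)
{
	long long mask = v >> (sizeof(long long) * CHAR_BIT - 1);

	/* negative: (v - 1) ^ ~0 == -v ; non-negative: (v + 0) ^ 0 == v */
	return (v + mask) ^ mask;
}

int
main(void)
{
	/* prints "42 42" */
	printf("%lld %lld\n", branchless_abs(-42LL), branchless_abs(42LL));

	return 0;
}

The same pattern appears verbatim in test_timer() and test_tcap_budgets_single() above, where it keeps the measured deviation non-negative without adding a mispredictable branch inside the timing loop.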