From 212cf8689f99e7eef9a62c584341def084f02bf4 Mon Sep 17 00:00:00 2001 From: Jacek Maksymowicz Date: Thu, 21 Mar 2024 13:34:25 +0100 Subject: [PATCH 1/7] armv7a: refactor CPU init code, small improvements in assembly Move memory barrier in spinlock clear operation zynq7000: Add memory barriers to SLCR locking/unlocking JIRA: RTOS-796 --- hal/armv7a/_armv7a.S | 5 +- hal/armv7a/_interrupts.S | 97 ++++++++-------- hal/armv7a/exceptions.c | 2 +- hal/armv7a/spinlock.c | 8 +- hal/armv7a/zynq7000/_init.S | 215 +++++++++++++++++++----------------- hal/armv7a/zynq7000/zynq.c | 3 + 6 files changed, 174 insertions(+), 156 deletions(-) diff --git a/hal/armv7a/_armv7a.S b/hal/armv7a/_armv7a.S index 71524ae7b..2116747a7 100644 --- a/hal/armv7a/_armv7a.S +++ b/hal/armv7a/_armv7a.S @@ -281,9 +281,8 @@ hal_jmp: cps #IRQ_MODE mov r5, #0x10 tst r4, #1 - orrne r5, r5, #(1 << 5) - push {r5} - push {r4} + orrne r5, r5, #THUMB_STATE + push {r4, r5} rfefd sp! .size hal_jmp, .-hal_jmp .ltorg diff --git a/hal/armv7a/_interrupts.S b/hal/armv7a/_interrupts.S index 7a3fcc652..4bde63230 100644 --- a/hal/armv7a/_interrupts.S +++ b/hal/armv7a/_interrupts.S @@ -19,14 +19,21 @@ .arm +.macro push_fpu_state reg_tmp + vpush {d16-d31} + vpush {d0-d15} + vmrs \reg_tmp, fpscr + push {\reg_tmp} +.endm + .globl _exception_undef .type _exception_undef, %function _exception_undef: cpsid if stmfd sp, {r0-r4} - mov r0, #1 + mov r0, #1 /* exc_undef */ mrs r3, spsr - tst r3, #0x20 + tst r3, #THUMB_STATE subeq r2, lr, #4 subne r2, lr, #2 b _exceptions_dispatch @@ -38,7 +45,7 @@ _exception_undef: _exception_prefetch: cpsid if stmfd sp, {r0-r4} - mov r0, #3 + mov r0, #3 /* exc_prefetch */ sub r2, lr, #4 b _exceptions_dispatch .size _exception_prefetch, .-_exception_prefetch @@ -49,7 +56,7 @@ _exception_prefetch: _exception_abort: cpsid if stmfd sp, {r0-r4} - mov r0, #4 + mov r0, #4 /* exc_abort */ sub r2, lr, #8 b _exceptions_dispatch .size _exception_abort, .-_exception_abort @@ -58,9 +65,13 @@ _exception_abort: 
.globl _exceptions_dispatch .type _exceptions_dispatch, %function _exceptions_dispatch: + /* Contents of registers: + * r0 - exception number + * r2 - PC of the instruction that caused the exception + * registers r0-r4 from previous context are saved below SP */ mrs r3, spsr sub r1, sp, #0x14 - mrc p15, 0, r4, c13, c0, 4 + mrc p15, 0, r4, c13, c0, 4 /* TPIDRPRW */ cps #SYS_MODE tst r3, #0x0f movne r4, sp @@ -69,24 +80,25 @@ _exceptions_dispatch: mov sp, r4 ldmfd r1, {r4-r8} push {r3-r8} - vpush {d16-d31} - vpush {d0-d15} - vmrs r4,fpscr - push {r4} + + push_fpu_state r4 + sub sp, sp, #8 str sp, [sp] - mrc p15, 0, r1, c6, c0, 2 + + /* Push exception context */ + mrc p15, 0, r1, c6, c0, 2 /* IFAR */ push {r1} - mrc p15, 0, r1, c5, c0, 1 + mrc p15, 0, r1, c5, c0, 1 /* IFSR */ push {r1} - mrc p15, 0, r1, c6, c0, 0 + mrc p15, 0, r1, c6, c0, 0 /* DFAR */ push {r1} - mrc p15, 0, r1, c5, c0, 0 + mrc p15, 0, r1, c5, c0, 0 /* DFSR */ push {r1} + mov r1, sp - ldr lr, =exceptions_dispatch - blx lr + blx exceptions_dispatch ldr sp, [sp, #0x10] add sp, sp, #8 @@ -122,19 +134,14 @@ _hal_cpuRestoreCtx: .type hal_cpuReschedule, %function hal_cpuReschedule: cpsid if + /* Store CPU registers */ + str sp, [sp, #-12] + push {lr} /* Push LR as both PC and LR */ push {lr} - stmfd sp, {r0-r14} - sub sp, sp, #0x3c - - mov r3, r1 /* Save spinlock context for later */ - - /* Default return value - EOK */ - mov r1, #0 - str r1, [sp] - - ldr r1, [sp, #0x34] - add r1, #4 - str r1, [sp, #0x34] + sub sp, #4 /* Skip over SP, already saved */ + push {r1-r12} + mov r3, #0 + push {r3} /* Push default return value (EOK) as R0 */ mrs r4, cpsr @@ -144,30 +151,26 @@ hal_cpuReschedule: add r0, #12 /* Spinlock clear */ -spinlock: - ldrexb r1, [r0] - add r1, r1, #1 dmb - strexb r2, r1, [r0] +spinlock: + ldrexb r3, [r0] + add r3, r3, #1 + strexb r2, r3, [r0] cmp r2, #0 bne spinlock - ldrb r1, [r3] + ldrb r1, [r1] bic r4, #0xff and r1, #0xff orr r4, r4, r1 1: - /* store CPSR with adjusted M and I flags */ + 
/* store CPSR with adjusted I, F, T flags */ + bic r4, #0xe0 and r5, lr, #1 /* extract Thumb flag from LR address */ orr r4, r4, r5, lsl #5 - bic r4, #0xc0 push {r4} - /* Store fpu context */ - vpush {d16-d31} - vpush {d0-d15} - vmrs r4,fpscr - push {r4} + push_fpu_state r4 sub r1, sp, #8 push {r1} @@ -212,11 +215,7 @@ _interrupts_dispatch: ldmfd r0, {r3-r6} push {r2-r6} - /* Store fpu context */ - vpush {d16-d31} - vpush {d0-d15} - vmrs r4,fpscr - push {r4} + push_fpu_state r4 /* save SP on top of the stack and pass it as arg1 to IRQ handler (it is cpu_context_t *) */ sub r1, sp, #8 @@ -247,20 +246,18 @@ _syscalls_dispatch: biceq r0, r0, #0xff000000 ldrneh r0, [r2, #-2] bicne r0, r0, #0xff00 - mrc p15, 0, r4, c13, c0, 4 + mrc p15, 0, r4, c13, c0, 4 /* TPIDRPRW */ cpsie af, #SYS_MODE + /* Store CPU state onto kernel stack */ stmfd r4!, {r2} stmfd r4!, {r5-r14} mov r2, sp mov sp, r4 ldmfd r1, {r4-r8} push {r3-r8} - vpush {d16-d31} - vpush {d0-d15} - vmrs r4,fpscr - push {r4} + push_fpu_state r4 sub r1, sp, #8 push {r1} push {r1} @@ -273,8 +270,6 @@ _syscalls_dispatch: cpsid if - str r0, [sp, #272] - ldr sp, [sp] add sp, sp, #8 b _hal_cpuRestoreCtx diff --git a/hal/armv7a/exceptions.c b/hal/armv7a/exceptions.c index 5d5b0da6f..308ead1b4 100644 --- a/hal/armv7a/exceptions.c +++ b/hal/armv7a/exceptions.c @@ -80,7 +80,7 @@ void hal_exceptionsDumpContext(char *buff, exc_context_t *ctx, int n) i += hal_i2s(" fp=", &buff[i], ctx->cpuCtx.fp, 16, 1); i += hal_i2s("\n ip=", &buff[i], ctx->cpuCtx.ip, 16, 1); - i += hal_i2s(" sp=", &buff[i], (u32)ctx + 21 * 4, 16, 1); + i += hal_i2s(" sp=", &buff[i], ctx->cpuCtx.sp, 16, 1); i += hal_i2s(" lr=", &buff[i], ctx->cpuCtx.lr, 16, 1); i += hal_i2s(" pc=", &buff[i], ctx->cpuCtx.pc, 16, 1); diff --git a/hal/armv7a/spinlock.c b/hal/armv7a/spinlock.c index 25fd4450b..840e08423 100644 --- a/hal/armv7a/spinlock.c +++ b/hal/armv7a/spinlock.c @@ -25,6 +25,7 @@ static struct { void hal_spinlockSet(spinlock_t *spinlock, spinlock_ctx_t *sc) { + 
/* clang-format off */ __asm__ volatile(" \ mrs r1, cpsr; \ cpsid if; \ @@ -41,24 +42,27 @@ void hal_spinlockSet(spinlock_t *spinlock, spinlock_ctx_t *sc) : : "r" (sc), "r" (&spinlock->lock) : "r1", "r2", "memory", "cc"); + /* clang-format on */ } void hal_spinlockClear(spinlock_t *spinlock, spinlock_ctx_t *sc) { + /* clang-format off */ __asm__ volatile (" \ + dmb; \ 1 : \ ldrexb r1, [%0]; \ add r1, r1, #1; \ - dmb; \ strexb r2, r1, [%0]; \ cmp r2, #0; \ bne 1b; \ ldrb r1, [%1]; \ - msr cpsr_c, r1;" + msr cpsr_c, r1" : : "r" (&spinlock->lock), "r" (sc) : "r1", "r2", "memory"); + /* clang-format on */ } diff --git a/hal/armv7a/zynq7000/_init.S b/hal/armv7a/zynq7000/_init.S index 3dda2feda..4802aa0f7 100644 --- a/hal/armv7a/zynq7000/_init.S +++ b/hal/armv7a/zynq7000/_init.S @@ -24,21 +24,26 @@ .extern relOffs .extern _end -#define ADDR_TTL1 (pmap_common - VADDR_KERNEL + ADDR_DDR) -#define ADDR_TTL2_K (ADDR_TTL1 + 4 * SIZE_PAGE) -#define ADDR_TTL2_EXC (ADDR_TTL2_K + SIZE_PAGE) -#define ADDR_STACK 0x004ff000 - -#define VADDR_SYSPAGE (_end + SIZE_PAGE - 1) -#define VADDR_UART0 (VADDR_SYSPAGE + 2 * SIZE_PAGE) -#define VADDR_UART1 (VADDR_UART0 + SIZE_PAGE) -#define VADDR_GIC (VADDR_UART1 + SIZE_PAGE) -#define VADDR_TTC (VADDR_GIC + 4 * SIZE_PAGE) - -#define PADDR_UART0 0xe0000000 -#define PADDR_UART1 0xe0001000 -#define PADDR_SLCR 0xf8000000 -#define PADDR_TTC 0xf8001000 +#define PA_KERNEL (ADDR_DDR) +#define PA_OF(va) ((va) - VADDR_KERNEL + PA_KERNEL) +#define TTL1_OFFSET_OF(va) ((va >> 20) << 2) + +#define VA_TTL1 (pmap_common) /* u32 kpdir[0x1000] */ +#define VA_TTL2_K (VA_TTL1 + 4 * SIZE_PAGE) /* u32 kptab[0x400] */ +#define VA_TTL2_EXC (VA_TTL2_K + SIZE_PAGE) /* u32 excptab[0x400] */ +#define VA_TTL_END (VA_TTL2_EXC + SIZE_PAGE) +#define PA_STACK (ADDR_DDR + 4 * 1024 * 1024 - SIZE_PAGE) + +#define VA_SYSPAGE (_end + SIZE_PAGE - 1) +#define VA_UART0 (VA_SYSPAGE + 2 * SIZE_PAGE) +#define VA_UART1 (VA_UART0 + SIZE_PAGE) +#define VA_GIC (VA_UART1 + SIZE_PAGE) +#define 
VA_TTC (VA_GIC + 4 * SIZE_PAGE) + +#define PA_UART0 0xe0000000 +#define PA_UART1 0xe0001000 +#define PA_SLCR 0xf8000000 +#define PA_TTC 0xf8001000 .arm @@ -58,6 +63,13 @@ _vector_table: b _interrupts_dispatch b _interrupts_dispatch +/* + Fill 4 words in memory starting from r0 with value r1 incremented by r2 after every step + r0 - pointer to destination, set past-the-end on return + r1 - initial value, set past-the-end on return + r2 - increment step, constant + r3 - will be set to 0 + */ _cpy4: mov r3, #4 str r1, [r0], #4 @@ -69,48 +81,9 @@ _cpy4: /* startup code */ _start: + /* r9 now contains PA of syspage from PLO */ cpsid aif, #SYS_MODE - ldr r0, =relOffs - ldr r1, =#VADDR_SYSPAGE - lsr r1, #12 - lsl r1, #12 - - sub r0, r0, #VADDR_KERNEL - add r0, r0, #ADDR_DDR - sub r2, r1, r9 - str r2, [r0] - - ldr r0, [r9, #4] /* syspage size address - syspage address + sizeof(hal_syspage_t) */ - add r2, r9, r0 /* end of the syspage */ - - ldr r0, =syspage - sub r0, r0, #VADDR_KERNEL - add r0, r0, #ADDR_DDR - str r1, [r0] - - sub r1, r1, #VADDR_KERNEL - add r1, r1, #ADDR_DDR - -syspage_cpy: - ldr r3, [r9], #4 - str r3, [r1], #4 - cmp r9, r2 - blo syspage_cpy - - /* Only CPU0 performs initialization, others go into WFI */ - mrc p15, 0, r1, c0, c0, 5 /* Read Multiprocessor Affinity Register */ - and r1, r1, #0xf /* Extract CPU ID */ - cmp r1, #0 - beq initialize - -wait_loop: -/* TODO: make appropriate action when other core than CPU0 is running */ - wfi - b wait_loop - - -initialize: /* Enable PMU */ mrc p15, 0, r0, c9, c12, 0 /* Read PMCR (Performance Monitor Control Register) */ orr r0, #0x7 /* Cycle counter reset - bit[2], Performance counter reset - bit[1], enable all counters - bit[0] */ @@ -162,52 +135,86 @@ set_loop: /* Invalidate TLB */ mcr p15, 0, r1, c8, c7, 0 - /* Init TTL1 */ - ldr r5, =ADDR_TTL1 + /* Read CPU ID, only core 0 continues initialization */ + mrc p15, 0, r1, c0, c0, 5 /* Read Multiprocessor Affinity Register */ + ands r1, r1, #0xf /* Extract CPU ID 
*/ + beq structs_init +wait_loop: + wfi + b wait_loop + +structs_init: + /* init memory structures (relOffs, syspage, TTLs) */ + ldr r1, =#VA_SYSPAGE + lsr r1, #12 + lsl r1, #12 + sub r2, r1, r9 + + ldr r0, =#PA_OF(relOffs) + str r2, [r0] + + ldr r0, [r9, #4] /* load syspage size from syspage address + sizeof(hal_syspage_t) */ + add r2, r9, r0 /* set r2 to end of the syspage */ + + ldr r0, =#PA_OF(syspage) + str r1, [r0] + + sub r1, r1, #VADDR_KERNEL + add r1, r1, #ADDR_DDR + mov r0, r9 + +syspage_cpy: + ldr r3, [r0], #4 + str r3, [r1], #4 + cmp r0, r2 + blo syspage_cpy + + /* Initialize MMU translation tables */ + /* Clear everything to 0 (invalid entry) */ + ldr r5, =PA_OF(VA_TTL1) mov r1, #0 - mov r2, #(4096 * 6 - 4) /* Size of kpdir, kptab and excptab in pmap_common */ -clear_ttl1: - str r1, [r5, r2] + mov r2, #(VA_TTL_END - VA_TTL1) +clear_ttls: subs r2, #4 - bne clear_ttl1 - str r1, [r5] + str r1, [r5, r2] + bne clear_ttls - /* In order to execute first stage of kernel */ - /* Map 4 MB P 0x00100000 -> V 0x00100000 */ - add r0, r5, #((ADDR_DDR >> 20) << 2) /* Entry address: TTL1 base address + entry index * 4 B (entry size) */ - ldr r1, =((ADDR_DDR & ~0xfffff) | (0x1 << 10) | 0x2) /* Section entry: base addres - DDR, AP = 01, APX = 0 (privileged access only) */ - mov r2, #0x100000 /* Size of section: 1 MB */ - bl _cpy4 /* Fill 4 entries in TTL1 */ + /* Needed to execute first stage of kernel, should be unmapped later */ + /* Map 4 MB V 0x00100000 -> P 0x00100000 */ + add r0, r5, #TTL1_OFFSET_OF(PA_KERNEL) /* Entry address: TTL1 base address + entry index * 4 B (entry size) */ + ldr r1, =((PA_KERNEL & ~0xfffff) | (0x1 << 10) | 0x2) /* Section entry: base address - DDR, AP = 01, APX = 0 (privileged access only) */ + mov r2, #0x100000 /* Size of section: 1 MB */ + bl _cpy4 /* Fill 4 entries in TTL1 */ /* Kernel TTL1 entries - * map 4 MB P 0x00100000 -> V 0xc0000000 */ - add r0, r5, #((VADDR_KERNEL >> 20) << 2) /* Entry address: virtual kernel address + entry 
index * 4 B (entry size) */ - ldr r1, =(ADDR_TTL2_K + 1) /* Ptr to kernel's TTL2 (pmap_common.kptab); bits [1:0] = 1 defines TTL2 */ - mov r2, #0x400 /* Size of TTL2 */ - bl _cpy4 /* Fill TTL1 with 4 TTL2's addresses; pmap_common.kptab consists of 4 TTL2 */ + * map 4 MB V VADDR_KERNEL -> TTL2 in pmap_common.kptab */ + add r0, r5, #TTL1_OFFSET_OF(VADDR_KERNEL) /* Entry address: virtual kernel address + entry index * 4 B (entry size) */ + ldr r1, =(PA_OF(VA_TTL2_K) + 1) /* Ptr to kernel's TTL2 (pmap_common.kptab); bits [1:0] = 1 defines TTL2 */ + mov r2, #0x400 /* Size of TTL2 */ + bl _cpy4 /* Fill TTL1 with 4 TTL2's addresses; pmap_common.kptab consists of 4 TTL2 */ - /* Exceptions vectors and stack TTL1 entry - * map 4MB V 0xffc00000 -> TTL2 of pmap_common.excptab */ - ldr r0, =(ADDR_TTL1 + (0xffc << 2)) /* Entry address: TTL1 address + index (0xffc - 4 entries in TTL1) * 4 B*/ - ldr r1, =(ADDR_TTL2_EXC + 1) /* Ptr to exception's TTL2 (pmap_common.excptab); bits [1:0] = 1 defines TTL2 */ - bl _cpy4 /* Fill TTL1 with 4 TTL2's addresses; pmap_common.excptab consists of 4 TTL2 */ + /* Exception vectors and stack TTL1 entry + * map 4MB V 0xffc00000 -> TTL2 in pmap_common.excptab */ + ldr r0, =(PA_OF(VA_TTL1) + TTL1_OFFSET_OF(0xffc00000)) /* Entry address: TTL1 address + index (0xffc - 4 entries in TTL1) * 4 B*/ + ldr r1, =(PA_OF(VA_TTL2_EXC) + 1) /* Ptr to exceptions' TTL2 (pmap_common.excptab); bits [1:0] = 1 defines TTL2 */ + bl _cpy4 /* Fill TTL1 with 4 TTL2's addresses; pmap_common.excptab consists of 4 TTL2 */ ldr r8, =(ADDR_DDR) /* Exceptions vectors TTL2 entry */ - /* Map P 0x00100000 -> V 0xffff0000 */ - ldr r0, =(ADDR_TTL2_EXC + (0x3f0 << 2)) /* Entry address: 4 entries from the end in last TTL2 in pmap_common.excptab */ + /* Map V 0xffff0000 -> P 0x00100000 */ + ldr r0, =(PA_OF(VA_TTL2_EXC) + (0x3f0 << 2)) /* Entry address: 4 entries from the end in last TTL2 in pmap_common.excptab */ orr r1, r8, #0x1a /* Ptr to physical address. 
Attributes: XN = 0, B = 0, C = 0, AP = 0x3, TEX = 0 */ str r1, [r0] /* Fill TTL2 entry */ - /* Stack TTL2 entry */ - /* Map P ADDR_STACK -> V 0xfffff000 */ - ldr r0, =(ADDR_TTL2_EXC + (0x3ff << 2)) /* Entry address: the last entry in 4 TTL2 in pmap_common.excptab */ - ldr r1, =((ADDR_STACK & ~0xfff) | 0x1e) /* Ptr to physical address. Attributes: XN = 0, B = 1, C = 1, AP = 0x3, TEX = 0 */ - str r1, [r0] /* Fill TTL2 entry */ + /* Stack TTL2 entry */ + /* Map V 0xfffff000 -> P PA_STACK */ + ldr r0, =(PA_OF(VA_TTL2_EXC) + (0x3ff << 2)) /* Entry address: the last entry in 4 TTL2 in pmap_common.excptab */ + ldr r1, =((PA_STACK & ~0xfff) | 0x1e) /* Ptr to physical address. Attributes: XN = 0, B = 1, C = 1, AP = 0x3, TEX = 0 */ + str r1, [r0] /* Set vector table pointer to virtual address */ ldr r0, =_vector_table @@ -215,8 +222,8 @@ clear_ttl1: /* Kernel TTL2 entries (pmap_common.kptab) */ - ldr r0, =ADDR_TTL2_K - ldr r1, =((ADDR_DDR & ~0xfff) + (1024 * SIZE_PAGE) | 0x1e) /* Ptr to physical address. Attributes: XN = 0, B = 1, C = 1, AP = 0x3, TEX = 0 */ + ldr r0, =PA_OF(VA_TTL2_K) + ldr r1, =((PA_KERNEL & ~0xfff) + (1024 * SIZE_PAGE) | 0x1e) /* Ptr past-the-end of physical addresses. 
Attributes: XN = 0, B = 1, C = 1, AP = 0x3, TEX = 0 */ mov r2, #(4 * 1024) /* size of pmap_common.kptab, it contains 4 TTL2 */ /* Map the whole kernel memory */ kernel_ttl2: @@ -226,10 +233,10 @@ kernel_ttl2: bne kernel_ttl2 - /* Kernel page directory: change attributes of pmap_common structure */ + /* Change memory attributes of kernel page directory within TTL2 */ ldr r1, =(pmap_common - VADDR_KERNEL) /* offset of pmap_common.kpdir */ - add r0, r1, lsr #10 /* r0 = ADDR_TTL2_K + (offset of: pmap_common.kpdir >> 10) */ - add r1, r1, #ADDR_DDR /* physical address of pmap_common.kpdir */ + add r0, r1, lsr #10 /* r0 = PA_OF(VA_TTL2_K) + (offset of: pmap_common.kpdir >> 10) */ + add r1, r1, #PA_KERNEL /* physical address of pmap_common.kpdir */ orr r1, r1, #0x1f /* Attributes: XN = 1, B = 1, C = 1, AP = 0x3, TEX = 0 */ mov r2, #0x1000 bl _cpy4 @@ -237,21 +244,21 @@ kernel_ttl2: bl _cpy4 bl _cpy4 - /* Kernel page tables */ + /* Also change attributes of kernel page tables */ bl _cpy4 /* Map perpehrals addresses */ /* Map UART0 4 KB P 0xE0000000 -> V CEIL(_end, SIZE_PAGE) */ - ldr r0, =(VADDR_UART0 - VADDR_KERNEL) + ldr r0, =(VA_UART0 - VADDR_KERNEL) lsr r0, #12 lsl r0, #2 - ldr r1, =ADDR_TTL2_K + ldr r1, =PA_OF(VA_TTL2_K) add r0, r0, r1 - ldr r1, =(PADDR_UART0 | 0x12) + ldr r1, =(PA_UART0 | 0x12) str r1, [r0], #4 /* Map UART1 4KB P 0xE0001000 -> V CEIL(_end + SIZE_PAGE, SIZE_PAGE) */ - ldr r1, =(PADDR_UART1 | 0x12) + ldr r1, =(PA_UART1 | 0x12) str r1, [r0], #4 /* Map GIC 16 KB after UARTs */ @@ -263,18 +270,18 @@ kernel_ttl2: bl _cpy4 /* Map SLCR after GIC */ - ldr r1, =(PADDR_SLCR | 0x12) + ldr r1, =(PA_SLCR | 0x12) str r1, [r0], #4 /* Map TTC after SLCR */ - ldr r1, =(PADDR_TTC | 0x12) + ldr r1, =(PA_TTC | 0x12) str r1, [r0], #4 /* Initialize MMU */ mov r1, #1 mcr p15, 0, r1, c2, c0, 2 /* Write Translation Table Base Control Register */ - ldr r1, =ADDR_TTL1 + ldr r1, =PA_OF(VA_TTL1) orr r1, r1, #(1 | (1 << 6)) /* Inner cacheability */ orr r1, r1, #(3 << 3) /* Outer 
cacheability */ mcr p15, 0, r1, c2, c0, 0 /* Write Translation Table Base Register 0 */ @@ -284,6 +291,9 @@ kernel_ttl2: ldr r1, =0x55555555 mcr p15, 0, r1, c3, c0, 0 /* Write Domain Access Control Register */ + mov r1, #0 + mcr p15, 0, r1, c13, c0, 1 /* Set ASID == 0 in CONTEXTIDR */ + /* Enable L1 Caches */ mrc p15, 0, r1, c1, c0, 0 /* Read SCTLR (System Control Register) data */ orr r1, r1, #(0x1 << 2) /* Enable data cache */ @@ -300,9 +310,15 @@ kernel_ttl2: dsb isb + /* Reset fault status/address registers to a known value */ + mov r0, #0xFFFFFFFF + mcr p15, 0, r0, c5, c0, 0 /* DFSR */ + mcr p15, 0, r0, c6, c0, 0 /* DFAR */ + mcr p15, 0, r0, c5, c0, 1 /* IFSR */ + mcr p15, 0, r0, c6, c0, 2 /* IFAR */ /* Setup stacks */ - eor r0, r0 /* initial SP */ + eor r0, r0 /* initialize SP to top of memory */ /* FIQ mode stack */ msr CPSR_c, #(FIQ_MODE | NO_INT) @@ -342,6 +358,7 @@ kernel_ttl2: orr r0, r0, #(0x1 << 30) /* FPU enable bit */ vmsr fpexc, r0 + /* Jump to main() in virtual memory */ ldr pc, =main #include "hal/armv7a/_interrupts.S" diff --git a/hal/armv7a/zynq7000/zynq.c b/hal/armv7a/zynq7000/zynq.c index 713aa145e..c5e4ae310 100644 --- a/hal/armv7a/zynq7000/zynq.c +++ b/hal/armv7a/zynq7000/zynq.c @@ -14,6 +14,7 @@ */ #include "hal/cpu.h" +#include "hal/armv7a/armv7a.h" #include "hal/spinlock.h" #include "include/arch/armv7a/zynq7000/zynq7000.h" @@ -79,6 +80,7 @@ extern unsigned int _end; static void _zynq_slcrLock(void) { + hal_cpuDataMemoryBarrier(); /* Ensure previous writes are committed before locking */ *(zynq_common.slcr + slcr_lock) = 0x0000767b; } @@ -86,6 +88,7 @@ static void _zynq_slcrLock(void) static void _zynq_slcrUnlock(void) { *(zynq_common.slcr + slcr_unlock) = 0x0000df0d; + hal_cpuDataMemoryBarrier(); /* Ensure subsequent writes are committed after unlocking */ } From b317d9c2bc2e72c5f2a81bb62486866b896c5658 Mon Sep 17 00:00:00 2001 From: Jacek Maksymowicz Date: Wed, 17 Apr 2024 17:28:30 +0200 Subject: [PATCH 2/7] armv7a: change page 
directory handling Page directories were previously 0x4000 bytes, but the upper half was never used by the hardware - only by pmap_resolve(). pmap_resolve() behavior was modified to align more closely with how hardware resolves virtual memory addresses. JIRA: RTOS-796 --- hal/armv7a/arch/cpu.h | 2 +- hal/armv7a/pmap.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/hal/armv7a/arch/cpu.h b/hal/armv7a/arch/cpu.h index e054c4fd7..7241a4b17 100644 --- a/hal/armv7a/arch/cpu.h +++ b/hal/armv7a/arch/cpu.h @@ -19,7 +19,7 @@ #include "hal/types.h" #define SIZE_PAGE 0x1000 -#define SIZE_PDIR 0x4000 +#define SIZE_PDIR 0x2000 #ifndef SIZE_KSTACK #define SIZE_KSTACK (8 * 1024) diff --git a/hal/armv7a/pmap.c b/hal/armv7a/pmap.c index 900fc8967..e06b4f975 100644 --- a/hal/armv7a/pmap.c +++ b/hal/armv7a/pmap.c @@ -167,8 +167,7 @@ int pmap_create(pmap_t *pmap, pmap_t *kpmap, page_t *p, void *vaddr) pmap->addr = p->addr; pmap->asid_ix = 0; - hal_memset(pmap->pdir, 0, (VADDR_KERNEL) >> 18); - hal_memcpy(&pmap->pdir[ID_PDIR(VADDR_KERNEL)], &kpmap->pdir[ID_PDIR(VADDR_KERNEL)], (VADDR_MAX - VADDR_KERNEL + 1) >> 18); + hal_memset(pmap->pdir, 0, SIZE_PDIR); hal_cpuDataMemoryBarrier(); hal_cpuDataSyncBarrier(); @@ -387,7 +386,9 @@ addr_t pmap_resolve(pmap_t *pmap, void *vaddr) hal_spinlockSet(&pmap_common.lock, &sc); - if (!(addr = pmap->pdir[pdi])) { + u32 *pdir = ((ptr_t)vaddr >= VADDR_USR_MAX) ? 
pmap_common.kpdir : pmap->pdir; + addr = pdir[pdi]; + if (addr == 0) { hal_spinlockClear(&pmap_common.lock, &sc); return 0; } From 484b9539b87049e4af57e2e68da8c069d5bab00f Mon Sep 17 00:00:00 2001 From: Jacek Maksymowicz Date: Tue, 26 Mar 2024 11:51:04 +0100 Subject: [PATCH 3/7] armv7a: add provisions for SMP to common architecture Add scheduler lock Add TLB management broadcast functions Implement correct CPU ID and CPU count functions Add inter-processor interrupt Add core count to CPU info print at startup JIRA: RTOS-796 --- hal/armv7a/_armv7a.S | 37 ++++++++++++++++++++ hal/armv7a/_interrupts.S | 21 +++++++++++ hal/armv7a/arch/cpu.h | 60 ++++++++++++++++++++++++++++++-- hal/armv7a/armv7a.h | 12 +++++++ hal/armv7a/cpu.c | 15 +++----- hal/armv7a/hal.c | 27 ++++++++++++++ hal/armv7a/imx6ull/config.h | 1 + hal/armv7a/imx6ull/imx6ull.c | 17 +++++++++ hal/armv7a/imx6ull/interrupts.c | 19 ++++++---- hal/armv7a/pmap.c | 21 +++++++---- hal/armv7a/zynq7000/config.h | 2 ++ hal/armv7a/zynq7000/interrupts.c | 37 ++++++++++++++++---- hal/armv7a/zynq7000/zynq.c | 46 ++++++++++++++++++++++++ 13 files changed, 283 insertions(+), 32 deletions(-) diff --git a/hal/armv7a/_armv7a.S b/hal/armv7a/_armv7a.S index 2116747a7..9960fb1c1 100644 --- a/hal/armv7a/_armv7a.S +++ b/hal/armv7a/_armv7a.S @@ -161,6 +161,43 @@ hal_cpuInvalTLB: .ltorg +.globl hal_cpuInvalASID_IS +.type hal_cpuInvalASID_IS, %function +hal_cpuInvalASID_IS: + dsb + and r0, r0, #0xff + mcr p15, 0, r0, c8, c3, 2 /* TLBIASIDIS */ + dsb + isb + bx lr +.size hal_cpuInvalASID_IS, .-hal_cpuInvalASID_IS +.ltorg + + +.globl hal_cpuInvalVA_IS +.type hal_cpuInvalVA_IS, %function +hal_cpuInvalVA_IS: + dsb + mcr p15, 0, r0, c8, c3, 1 /* TLBIMVAIS */ + dsb + isb + bx lr +.size hal_cpuInvalVA_IS, .-hal_cpuInvalVA_IS +.ltorg + + +.globl hal_cpuInvalTLB_IS +.type hal_cpuInvalTLB_IS, %function +hal_cpuInvalTLB_IS: + dsb + mcr p15, 0, r0, c8, c3, 0 /* TLBIALLIS */ + dsb + isb + bx lr +.size hal_cpuInvalTLB_IS, .-hal_cpuInvalTLB_IS 
+.ltorg + + .globl hal_cpuGetTTBR0 .type hal_cpuGetTTBR0, %function hal_cpuGetTTBR0: diff --git a/hal/armv7a/_interrupts.S b/hal/armv7a/_interrupts.S index 4bde63230..0a343d459 100644 --- a/hal/armv7a/_interrupts.S +++ b/hal/armv7a/_interrupts.S @@ -19,6 +19,8 @@ .arm +.extern schedulerLocked + .macro push_fpu_state reg_tmp vpush {d16-d31} vpush {d0-d15} @@ -26,6 +28,15 @@ push {\reg_tmp} .endm +.macro unlock_scheduler + ldr r0, =schedulerLocked + mov r1, #0 + dmb + str r1, [r0] + dmb + isb +.endm + .globl _exception_undef .type _exception_undef, %function _exception_undef: @@ -109,6 +120,9 @@ _exceptions_dispatch: .globl _hal_cpuRestoreCtx .type _hal_cpuRestoreCtx, %function _hal_cpuRestoreCtx: + /* CLREX should be executed as part of context switch */ + clrex + /* Restore fpu context */ pop {r4} vmsr fpscr, r4 @@ -180,6 +194,8 @@ spinlock: ldr sp, [sp] add sp, sp, #8 + + unlock_scheduler b _hal_cpuRestoreCtx .size hal_cpuReschedule, .-hal_cpuReschedule @@ -230,6 +246,11 @@ _interrupts_dispatch: ldr sp, [sp] add sp, sp, #8 + + cmp r0, #0 + beq 1f + unlock_scheduler +1: b _hal_cpuRestoreCtx .size _interrupts_dispatch, .-_interrupts_dispatch diff --git a/hal/armv7a/arch/cpu.h b/hal/armv7a/arch/cpu.h index 7241a4b17..8ad745a20 100644 --- a/hal/armv7a/arch/cpu.h +++ b/hal/armv7a/arch/cpu.h @@ -17,6 +17,7 @@ #define _HAL_ARMV7A_CPU_H_ #include "hal/types.h" +#include "config.h" #define SIZE_PAGE 0x1000 #define SIZE_PDIR 0x2000 @@ -183,16 +184,69 @@ static inline int hal_cpuSupervisorMode(cpu_context_t *ctx) static inline unsigned int hal_cpuGetID(void) { - return 0; + unsigned mpidr; + /* clang-format off */ + __asm__ volatile ("mrc p15, 0, %0, c0, c0, 5": "=r"(mpidr)); + /* clang-format on */ + return mpidr & 0xf; } -static inline unsigned int hal_cpuGetCount(void) +static inline void hal_cpuSignalEvent(void) { - return 1; + /* clang-format off */ + __asm__ volatile ("sev"); + /* clang-format on */ } +static inline void hal_cpuWaitForEvent(void) +{ + /* clang-format 
off */ + __asm__ volatile ("dsb\n wfe"); + /* clang-format on */ +} + + +static inline u32 hal_cpuAtomicGet(volatile u32 *dst) +{ + u32 result; + /* clang-format off */ + __asm__ volatile ( + "dmb\n" + "ldr %0, [%1]\n" + "dmb\n" + : "=r"(result) + : "r"(dst) + ); + /* clang-format on */ + return result; +} + + +static inline void hal_cpuAtomicInc(volatile u32 *dst) +{ + /* clang-format off */ + __asm__ volatile ( + "dmb\n" + "1:\n" + "ldrex r2, [%0]\n" + "add r2, r2, #1\n" + "strex r1, r2, [%0]\n" + "cmp r1, #0\n" + "bne 1b\n" + "dmb\n" + : + : "r"(dst) + : "r1", "r2", "memory" + ); + /* clang-format on */ +} + + +extern unsigned int hal_cpuGetCount(void); + + #endif #endif diff --git a/hal/armv7a/armv7a.h b/hal/armv7a/armv7a.h index 67ce22f07..1f538d30d 100644 --- a/hal/armv7a/armv7a.h +++ b/hal/armv7a/armv7a.h @@ -74,6 +74,18 @@ extern void hal_cpuInvalVA(ptr_t vaddr); extern void hal_cpuInvalTLB(void); +/* Invalidate TLB entries by ASID Match on all cores in Inner Shareable domain */ +extern void hal_cpuInvalASID_IS(u8 asid); + + +/* Invalidate Unified TLB by MVA on all cores in Inner Shareable domain */ +extern void hal_cpuInvalVA_IS(ptr_t vaddr); + + +/* Invalidate entire Unified TLB on all cores in Inner Shareable domain */ +extern void hal_cpuInvalTLB_IS(void); + + /* Read Translation Table Base Register 0 with properties */ extern addr_t hal_cpuGetTTBR0(void); diff --git a/hal/armv7a/cpu.c b/hal/armv7a/cpu.c index 10e310c31..e009025ff 100644 --- a/hal/armv7a/cpu.c +++ b/hal/armv7a/cpu.c @@ -162,6 +162,10 @@ char *hal_cpuInfo(char *info) info[n++] = 'p'; info[n++] = '0' + (midr & 0xf); + info[n++] = ' '; + info[n++] = 'x'; + info[n++] = '0' + hal_cpuGetCount(); + info[n] = '\0'; return info; @@ -242,17 +246,6 @@ void hal_cpuLowPower(time_t us, spinlock_t *spinlock, spinlock_ctx_t *sc) } -void hal_cpuBroadcastIPI(unsigned int intr) -{ -} - - -void hal_cpuSmpSync(void) -{ - /* Nothing to do */ -} - - /* cache management */ diff --git a/hal/armv7a/hal.c 
b/hal/armv7a/hal.c index 8b008c469..197e47296 100644 --- a/hal/armv7a/hal.c +++ b/hal/armv7a/hal.c @@ -23,9 +23,11 @@ struct { syspage_t *syspage; unsigned int relOffs; +u32 schedulerLocked = 0; extern void _hal_platformInit(void); +extern void _hal_cpuInit(void); void *hal_syspageRelocate(void *data) @@ -54,11 +56,34 @@ void _hal_start(void) void hal_lockScheduler(void) { +#if NUM_CPUS != 1 + /* clang-format off */ + __asm__ volatile( + "1:\n" + "dmb\n" + "mov r2, #1\n" + "ldrex r1, [%0]\n" + "cmp r1, #0\n" + "bne 1b\n" + "strex r1, r2, [%0]\n" + "cmp r1, #0\n" + "bne 1b\n" + "dmb\n" + : + : "r" (&schedulerLocked) + : "r1", "r2", "memory", "cc"); + /* clang-format on */ +#else + /* Not necessary on single-core systems */ + (void)schedulerLocked; + return; +#endif } __attribute__ ((section (".init"))) void _hal_init(void) { + schedulerLocked = 0; _hal_spinlockInit(); _hal_platformInit(); _hal_consoleInit(); @@ -66,6 +91,8 @@ __attribute__ ((section (".init"))) void _hal_init(void) _hal_exceptionsInit(); _hal_interruptsInit(); + _hal_cpuInit(); + _hal_timerInit(SYSTICK_INTERVAL); hal_common.started = 0; diff --git a/hal/armv7a/imx6ull/config.h b/hal/armv7a/imx6ull/config.h index 852875fba..a9cf03ac2 100644 --- a/hal/armv7a/imx6ull/config.h +++ b/hal/armv7a/imx6ull/config.h @@ -19,6 +19,7 @@ #define ADDR_DDR 0x80000000 #define SIZE_DDR 0x7ffffff +#define NUM_CPUS 1 #ifndef __ASSEMBLY__ diff --git a/hal/armv7a/imx6ull/imx6ull.c b/hal/armv7a/imx6ull/imx6ull.c index 23c7aae87..2827079fd 100644 --- a/hal/armv7a/imx6ull/imx6ull.c +++ b/hal/armv7a/imx6ull/imx6ull.c @@ -443,3 +443,20 @@ void _hal_platformInit(void) /* Restore output clocks state */ *(imx6ull_common.ccm + ccm_ccgr4) = tmp; } + + +void _hal_cpuInit(void) +{ +} + + +unsigned int hal_cpuGetCount(void) +{ + return NUM_CPUS; +} + + +void hal_cpuSmpSync(void) +{ + /* Nothing to do */ +} diff --git a/hal/armv7a/imx6ull/interrupts.c b/hal/armv7a/imx6ull/interrupts.c index 9d25a42af..c76d78e5c 100644 --- 
a/hal/armv7a/imx6ull/interrupts.c +++ b/hal/armv7a/imx6ull/interrupts.c @@ -48,16 +48,18 @@ extern int threads_schedule(unsigned int n, cpu_context_t *context, void *arg); extern unsigned int _end; -void interrupts_dispatch(unsigned int n, cpu_context_t *ctx) +int interrupts_dispatch(unsigned int n, cpu_context_t *ctx) { intr_handler_t *h; int reschedule = 0; spinlock_ctx_t sc; - n = *(interrupts.gic + iar) & 0x3ff; + u32 iarValue = *(interrupts.gic + iar); + n = iarValue & 0x3ff; - if (n >= SIZE_INTERRUPTS) - return; + if (n >= SIZE_INTERRUPTS) { + return 0; + } hal_spinlockSet(&interrupts.spinlock[n], &sc); @@ -72,11 +74,11 @@ void interrupts_dispatch(unsigned int n, cpu_context_t *ctx) if (reschedule) threads_schedule(n, ctx, NULL); - *(interrupts.gic + eoir) = n; + *(interrupts.gic + eoir) = iarValue; hal_spinlockClear(&interrupts.spinlock[n], &sc); - return; + return reschedule; } @@ -204,3 +206,8 @@ void _hal_interruptsInit(void) *(interrupts.gic + bpr) = 0; *(interrupts.gic + pmr) = 0xff; } + + +void hal_cpuBroadcastIPI(unsigned int intr) +{ +} diff --git a/hal/armv7a/pmap.c b/hal/armv7a/pmap.c index e06b4f975..3ee3bb7b8 100644 --- a/hal/armv7a/pmap.c +++ b/hal/armv7a/pmap.c @@ -24,6 +24,14 @@ #include "halsyspage.h" +#if NUM_CPUS != 1 +#define hal_cpuInvalVAAll hal_cpuInvalVA_IS +#define hal_cpuInvalASIDAll hal_cpuInvalASID_IS +#else +#define hal_cpuInvalVAAll hal_cpuInvalVA +#define hal_cpuInvalASIDAll hal_cpuInvalASID +#endif + extern unsigned int _end; extern unsigned int _etext; @@ -142,6 +150,7 @@ static void _pmap_asidDealloc(pmap_t *pmap) addr_t tmp; if (pmap->asid_ix != 0) { + hal_cpuInvalASIDAll(pmap_common.asids[pmap->asid_ix]); if (pmap->asid_ix != pmap_common.asidptr) { pmap_common.asid_map[pmap->asid_ix] = last = pmap_common.asid_map[pmap_common.asidptr]; last->asid_ix = pmap->asid_ix; @@ -237,13 +246,16 @@ static void _pmap_writeEntry(ptr_t *ptable, void *va, addr_t pa, int attributes, int pti = ID_PTABLE((ptr_t)va); 
hal_cpuCleanDataCache((ptr_t)&ptable[pti], (ptr_t)&ptable[pti] + sizeof(ptr_t)); + ptr_t oldEntry = ptable[pti]; if (attributes & PGHD_PRESENT) ptable[pti] = (pa & ~0xfff) | attrMap[attributes & 0x1f]; else ptable[pti] = 0; hal_cpuDataSyncBarrier(); - hal_cpuInvalVA(((ptr_t)va & ~0xfff) | asid); + if ((oldEntry & 0x3) != 0) { + hal_cpuInvalVAAll(((ptr_t)va & ~0xfff) | asid); + } hal_cpuBranchInval(); hal_cpuICacheInval(); @@ -263,10 +275,7 @@ static void _pmap_addTable(pmap_t *pmap, int pdi, addr_t pa) pmap->pdir[pdi + 2] = pa + 0x800; pmap->pdir[pdi + 3] = pa + 0xc00; - hal_cpuInvalASID(pmap_common.asids[pmap->asid_ix]); - - hal_cpuDataSyncBarrier(); - hal_cpuInstrBarrier(); + hal_cpuInvalASIDAll(pmap_common.asids[pmap->asid_ix]); } @@ -561,7 +570,7 @@ void _pmap_init(pmap_t *pmap, void **vstart, void **vend) /* Remove initial kernel mapping */ for (i = 0; i < 4; ++i) { pmap->pdir[ID_PDIR(pmap_common.minAddr) + i] = 0; - hal_cpuInvalVA(pmap_common.minAddr + i * (1 << 2)); + hal_cpuInvalVAAll(pmap_common.minAddr + i * (1 << 2)); } pmap->start = (void *)VADDR_KERNEL; diff --git a/hal/armv7a/zynq7000/config.h b/hal/armv7a/zynq7000/config.h index d6839326e..89f724176 100644 --- a/hal/armv7a/zynq7000/config.h +++ b/hal/armv7a/zynq7000/config.h @@ -27,6 +27,8 @@ #define ADDR_DDR 0x00100000 #define SIZE_DDR 0x7ffffff +#define NUM_CPUS 2 + #ifndef __ASSEMBLY__ #define HAL_NAME_PLATFORM "Xilinx Zynq-7000 " diff --git a/hal/armv7a/zynq7000/interrupts.c b/hal/armv7a/zynq7000/interrupts.c index 0f4b95a8a..64b999d72 100644 --- a/hal/armv7a/zynq7000/interrupts.c +++ b/hal/armv7a/zynq7000/interrupts.c @@ -13,6 +13,8 @@ * %LICENSE% */ +#include "hal/armv7a/armv7a.h" + #include "hal/cpu.h" #include "hal/spinlock.h" #include "hal/interrupts.h" @@ -25,7 +27,12 @@ #define SIZE_HANDLERS 4 #define SPI_FIRST_IRQID 32 +#define SGI_FLT_USE_LIST 0 /* Send SGI to CPUs according to targetList */ +#define SGI_FLT_OTHER_CPUS 1 /* Send SGI to all CPUs except the one that called this function */ 
+#define SGI_FLT_THIS_CPU 2 /* Send SGI to the CPU that called this function */ + +/* clang-format off */ enum { /* Interrupt interface registers */ cicr = 0x40, cpmr, cbpr, ciar, ceoir, crpr, chpir, cabpr, @@ -41,6 +48,7 @@ enum { enum { reserved = 0, high_lvl = 1, rising_edge = 3 }; +/* clang-format on */ struct { @@ -69,16 +77,18 @@ extern int threads_schedule(unsigned int n, cpu_context_t *context, void *arg); extern unsigned int _end; -void interrupts_dispatch(unsigned int n, cpu_context_t *ctx) +int interrupts_dispatch(unsigned int n, cpu_context_t *ctx) { intr_handler_t *h; int reschedule = 0; spinlock_ctx_t sc; - n = *(interrupts_common.gic + ciar) & 0x3ff; + u32 ciarValue = *(interrupts_common.gic + ciar); + n = ciarValue & 0x3ff; - if (n >= SIZE_INTERRUPTS) - return; + if (n >= SIZE_INTERRUPTS) { + return 0; + } hal_spinlockSet(&interrupts_common.spinlock[n], &sc); @@ -93,11 +103,11 @@ void interrupts_dispatch(unsigned int n, cpu_context_t *ctx) if (reschedule) threads_schedule(n, ctx, NULL); - *(interrupts_common.gic + ceoir) = n; + *(interrupts_common.gic + ceoir) = ciarValue; hal_spinlockClear(&interrupts_common.spinlock[n], &sc); - return; + return reschedule; } @@ -220,6 +230,8 @@ void _hal_interruptsInit(void) interrupts_setCPU(i, 0x1); } + /* SGI and PPI interrupts are fixed to always be on both CPUs */ + /* Disable interrupts */ *(interrupts_common.gic + dicer0) = 0xffffffff; *(interrupts_common.gic + dicer0 + 1) = 0xffffffff; @@ -237,3 +249,16 @@ void _hal_interruptsInit(void) /* EnableS = 1; EnableNS = 1; AckCtl = 1; FIQEn = 0 */ *(interrupts_common.gic + cicr) |= 0x7; } + + +static void hal_cpuSendSGI(u8 targetFilter, u8 targetList, u8 intID) +{ + *(interrupts_common.gic + dsgir) = ((targetFilter & 0x3) << 24) | (targetList << 16) | (intID & 0xf); + hal_cpuDataMemoryBarrier(); +} + + +void hal_cpuBroadcastIPI(unsigned int intr) +{ + hal_cpuSendSGI(SGI_FLT_OTHER_CPUS, 0, intr); +} diff --git a/hal/armv7a/zynq7000/zynq.c 
b/hal/armv7a/zynq7000/zynq.c index c5e4ae310..0c5bff0ca 100644 --- a/hal/armv7a/zynq7000/zynq.c +++ b/hal/armv7a/zynq7000/zynq.c @@ -13,6 +13,7 @@ * %LICENSE% */ +#include "hal/armv7a/armv7a.h" #include "hal/cpu.h" #include "hal/armv7a/armv7a.h" #include "hal/spinlock.h" @@ -72,6 +73,7 @@ enum { struct { spinlock_t pltctlSp; volatile u32 *slcr; + unsigned int nCpus; } zynq_common; @@ -654,3 +656,47 @@ void _hal_platformInit(void) hal_spinlockCreate(&zynq_common.pltctlSp, "pltctl"); zynq_common.slcr = (void *)(((u32)&_end + 9 * SIZE_PAGE - 1) & ~(SIZE_PAGE - 1)); } + + +unsigned int hal_cpuGetCount(void) +{ + return zynq_common.nCpus; +} + + +static u32 checkNumCPUs(void) +{ + /* First check if MPIDR indicates uniprocessor system or no MP extensions */ + unsigned mpidr; + /* clang-format off */ + __asm__ volatile ("mrc p15, 0, %0, c0, c0, 5": "=r"(mpidr)); + /* clang-format on */ + if ((mpidr >> 30) != 0x2) { + return 1; + } + + /* Otherwise we are in a multiprocessor system and we can check SCU for number of cores in SMP */ + volatile u32 *scu = (void *)(((u32)&_end + 5 * SIZE_PAGE - 1) & ~(SIZE_PAGE - 1)); + /* We cannot use SCU_CPU_Power_Status_Register because it's not implemented correctly on QEMU */ + u32 powerStatus = (*(scu + 1)) >> 4; /* SCU_CONFIGURATION_REGISTER */ + u32 cpusAvailable = 0; + for (int i = 0; i < 4; i++, powerStatus >>= 1) { + if ((powerStatus & 0x1) == 1) { + cpusAvailable++; + } + } + + return cpusAvailable; +} + + +void _hal_cpuInit(void) +{ + zynq_common.nCpus = checkNumCPUs(); +} + + +void hal_cpuSmpSync(void) +{ + /* TODO: not implemented yet */ +} From 4b7e010bfa04d12273b95d91b26fdb59f578e9cd Mon Sep 17 00:00:00 2001 From: Jacek Maksymowicz Date: Thu, 25 Apr 2024 17:23:54 +0200 Subject: [PATCH 4/7] armv7a: change memory attributes Set memory as shareable and enable write-allocate - needed for SMP Change AP values of 110 into 111 - previous value is deprecated JIRA: RTOS-796 --- hal/armv7a/pmap.c | 104 
+++++++++++++++++++----------------- hal/armv7a/zynq7000/_init.S | 40 ++++++++------ 2 files changed, 79 insertions(+), 65 deletions(-) diff --git a/hal/armv7a/pmap.c b/hal/armv7a/pmap.c index 3ee3bb7b8..c8d32c0da 100644 --- a/hal/armv7a/pmap.c +++ b/hal/armv7a/pmap.c @@ -37,25 +37,27 @@ extern unsigned int _etext; #define SIZE_EXTEND_BSS 18 * SIZE_PAGE -#define TT2S_ATTR_MASK 0xfff -#define TT2S_NOTGLOBAL 0x800 -#define TT2S_SHAREABLE 0x400 -#define TT2S_READONLY 0x200 +#define TT2S_ATTR_MASK 0xfff +#define TT2S_NOTGLOBAL 0x800 +#define TT2S_SHAREABLE 0x400 +#define TT2S_SMALLPAGE 0x002 +#define TT2S_EXECNEVER 0x001 /* Memory region attributes (encodes TT2 descriptor bits [11:0]: ---T EX-- CB--) */ -#define TT2S_ORDERED 0x000 -#define TT2S_SHARED_DEV 0x004 -#define TT2S_CACHED 0x00c -#define TT2S_NOTCACHED 0x040 -#define TT2S_NOTSHARED_DEV 0x080 -#define TT2S_PL0ACCESS 0x020 -#define TT2S_ACCESSFLAG 0x010 -#define TT2S_SMALLPAGE 0x002 -#define TT2S_EXECNEVER 0x001 - -#define TT2S_CACHING_ATTR TT2S_CACHED - -/* Page dirs & tables are write-back no write-allocate inner/outer cachable */ -#define TTBR_CACHE_CONF (1 | (1 << 6) | (3 << 3)) +#define TT2S_ORDERED 0x000 +#define TT2S_SHARED_DEV 0x004 +#define TT2S_CACHED 0x04c +#define TT2S_NOTCACHED 0x040 +#define TT2S_NOTSHARED_DEV 0x080 +/* Access permission bits AP[2:0] */ +#define TT2S_READONLY 0x200 +#define TT2S_PL0ACCESS 0x020 +#define TT2S_ACCESSFLAG 0x010 + +#define TT2S_COMMON_ATTR (TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHAREABLE) +#define TT2S_CACHING_ATTR TT2S_CACHED + +/* Page dirs & tables are write-back no write-allocate inner/outer cacheable, shareable */ +#define TTBR_CACHE_CONF (1 | (1 << 6) | (3 << 3) | 2) #define ID_PDIR(vaddr) (((ptr_t)(vaddr) >> 20)) #define ID_PTABLE(vaddr) (((ptr_t)(vaddr) >> 12) & 0x3ff) @@ -85,40 +87,42 @@ struct { static const char *const marksets[4] = { "BBBBBBBBBBBBBBBB", "KYCPMSHKKKKKKKKK", "AAAAAAAAAAAAAAAA", "UUUUUUUUUUUUUUUU" }; +/* clang-format off */ static const u16 
attrMap[] = { - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_CACHING_ATTR | TT2S_EXECNEVER | TT2S_READONLY, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_READONLY, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_CACHING_ATTR | TT2S_READONLY, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV | TT2S_READONLY, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_CACHING_ATTR | TT2S_EXECNEVER, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV | TT2S_EXECNEVER, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_CACHING_ATTR, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV, - TT2S_SMALLPAGE | TT2S_CACHING_ATTR | TT2S_EXECNEVER | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_CACHING_ATTR | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_SHARED_DEV | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_CACHING_ATTR | TT2S_EXECNEVER | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_CACHING_ATTR | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_NOTCACHED | TT2S_EXECNEVER | TT2S_READONLY, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_READONLY, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_NOTCACHED | TT2S_READONLY, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV | TT2S_READONLY, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_NOTCACHED | TT2S_EXECNEVER, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV | TT2S_EXECNEVER, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_NOTCACHED, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV, - TT2S_SMALLPAGE | TT2S_NOTCACHED | TT2S_EXECNEVER | TT2S_READONLY | 
TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_NOTCACHED | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_SHARED_DEV | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_NOTCACHED | TT2S_EXECNEVER | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_NOTCACHED | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, - TT2S_SMALLPAGE | TT2S_ACCESSFLAG | TT2S_SHARED_DEV | TT2S_PL0ACCESS | TT2S_NOTGLOBAL + TT2S_COMMON_ATTR | TT2S_CACHING_ATTR | TT2S_EXECNEVER | TT2S_READONLY, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_READONLY, + TT2S_COMMON_ATTR | TT2S_CACHING_ATTR | TT2S_READONLY, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_READONLY, + TT2S_COMMON_ATTR | TT2S_CACHING_ATTR | TT2S_EXECNEVER, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_EXECNEVER, + TT2S_COMMON_ATTR | TT2S_CACHING_ATTR, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV, + TT2S_COMMON_ATTR | TT2S_CACHING_ATTR | TT2S_EXECNEVER | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_CACHING_ATTR | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_CACHING_ATTR | TT2S_EXECNEVER | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_CACHING_ATTR | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_NOTCACHED | TT2S_EXECNEVER | TT2S_READONLY, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_READONLY, + 
TT2S_COMMON_ATTR | TT2S_NOTCACHED | TT2S_READONLY, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_READONLY, + TT2S_COMMON_ATTR | TT2S_NOTCACHED | TT2S_EXECNEVER, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_EXECNEVER, + TT2S_COMMON_ATTR | TT2S_NOTCACHED, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV, + TT2S_COMMON_ATTR | TT2S_NOTCACHED | TT2S_EXECNEVER | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_NOTCACHED | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_READONLY | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_NOTCACHED | TT2S_EXECNEVER | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_EXECNEVER | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_NOTCACHED | TT2S_PL0ACCESS | TT2S_NOTGLOBAL, + TT2S_COMMON_ATTR | TT2S_SHARED_DEV | TT2S_PL0ACCESS | TT2S_NOTGLOBAL }; +/* clang-format on */ static void _pmap_asidAlloc(pmap_t *pmap) diff --git a/hal/armv7a/zynq7000/_init.S b/hal/armv7a/zynq7000/_init.S index 4802aa0f7..a82f0945f 100644 --- a/hal/armv7a/zynq7000/_init.S +++ b/hal/armv7a/zynq7000/_init.S @@ -45,6 +45,16 @@ #define PA_SLCR 0xf8000000 #define PA_TTC 0xf8001000 +/* Attributes for kernel pages: + * Small page, XN = 0 (allow Execute), TEX = 1 C = 1 B = 1 (Outer and Inner Write-Back, Write-Allocate), + * AP = 0x1 (R/W from PL1 only), S = 1 (Shareable), nG = 0 (Global) */ +#define KERNEL_PAGE_ATTR 0x45E + +/* Attributes for device pages: + * Small page, XN = 1 (Execute Never), TEX = 0 C = 0 B = 1 (Shared Device), + * AP = 0x1 (R/W from PL1 only), S = 0 (unused), nG = 0 (Global) */ +#define DEVICE_PAGE_ATTR 0x17 + .arm .section .init, "ax" @@ -206,14 +216,14 @@ clear_ttls: /* Exceptions vectors TTL2 entry */ /* Map V 0xffff0000 -> P 0x00100000 */ ldr r0, =(PA_OF(VA_TTL2_EXC) + (0x3f0 << 2)) /* Entry address: 4 entries from the end in last TTL2 in 
pmap_common.excptab */ - orr r1, r8, #0x1a /* Ptr to physical address. Attributes: XN = 0, B = 0, C = 0, AP = 0x3, TEX = 0 */ + orr r1, r8, #0x1a /* Ptr to physical address. Attributes: XN = 0, B = 0, C = 0, AP = 0x1, TEX = 0 */ str r1, [r0] /* Fill TTL2 entry */ /* Stack TTL2 entry */ /* Map V 0xfffff000 -> P PA_STACK */ ldr r0, =(PA_OF(VA_TTL2_EXC) + (0x3ff << 2)) /* Entry address: the last entry in 4 TTL2 in pmap_common.excptab */ - ldr r1, =((PA_STACK & ~0xfff) | 0x1e) /* Ptr to physical address. Attributes: XN = 0, B = 1, C = 1, AP = 0x3, TEX = 0 */ + ldr r1, =((PA_STACK & ~0xfff) | KERNEL_PAGE_ATTR) str r1, [r0] /* Set vector table pointer to virtual address */ @@ -223,8 +233,8 @@ clear_ttls: /* Kernel TTL2 entries (pmap_common.kptab) */ ldr r0, =PA_OF(VA_TTL2_K) - ldr r1, =((PA_KERNEL & ~0xfff) + (1024 * SIZE_PAGE) | 0x1e) /* Ptr past-the-end of physical addresses. Attributes: XN = 0, B = 1, C = 1, AP = 0x3, TEX = 0 */ - mov r2, #(4 * 1024) /* size of pmap_common.kptab, it contains 4 TTL2 */ + ldr r1, =((PA_KERNEL & ~0xfff) + (1024 * SIZE_PAGE) | KERNEL_PAGE_ATTR) /* Ptr past-the-end of physical addresses */ + mov r2, #(4 * 1024) /* size of pmap_common.kptab, it contains 4 TTL2 */ /* Map the whole kernel memory */ kernel_ttl2: subs r2, r2, #4 @@ -237,7 +247,8 @@ kernel_ttl2: ldr r1, =(pmap_common - VADDR_KERNEL) /* offset of pmap_common.kpdir */ add r0, r1, lsr #10 /* r0 = PA_OF(VA_TTL2_K) + (offset of: pmap_common.kpdir >> 10) */ add r1, r1, #PA_KERNEL /* physical address of pmap_common.kpdir */ - orr r1, r1, #0x1f /* Attributes: XN = 1, B = 1, C = 1, AP = 0x3, TEX = 0 */ + ldr r3, =#(KERNEL_PAGE_ATTR | 0x01) /* Attributes: default + XN bit */ + orr r1, r1, r3 mov r2, #0x1000 bl _cpy4 bl _cpy4 @@ -254,38 +265,37 @@ kernel_ttl2: lsl r0, #2 ldr r1, =PA_OF(VA_TTL2_K) add r0, r0, r1 - ldr r1, =(PA_UART0 | 0x12) + ldr r1, =(PA_UART0 | DEVICE_PAGE_ATTR) str r1, [r0], #4 /* Map UART1 4KB P 0xE0001000 -> V CEIL(_end + SIZE_PAGE, SIZE_PAGE) */ - ldr r1, =(PA_UART1 | 
0x12) + ldr r1, =(PA_UART1 | DEVICE_PAGE_ATTR) str r1, [r0], #4 /* Map GIC 16 KB after UARTs */ mrc p15, 4, r1, c15, c0, 0 /* Get GIC paddr */ lsr r1, #16 lsl r1, #16 - orr r1, r1, #0x12 + orr r1, r1, #DEVICE_PAGE_ATTR mov r2, #(1 << 12) bl _cpy4 /* Map SLCR after GIC */ - ldr r1, =(PA_SLCR | 0x12) + ldr r1, =(PA_SLCR | DEVICE_PAGE_ATTR) str r1, [r0], #4 /* Map TTC after SLCR */ - ldr r1, =(PA_TTC | 0x12) + ldr r1, =(PA_TTC | DEVICE_PAGE_ATTR) str r1, [r0], #4 /* Initialize MMU */ mov r1, #1 - mcr p15, 0, r1, c2, c0, 2 /* Write Translation Table Base Control Register */ + mcr p15, 0, r1, c2, c0, 2 /* Write Translation Table Base Control Register */ ldr r1, =PA_OF(VA_TTL1) - orr r1, r1, #(1 | (1 << 6)) /* Inner cacheability */ - orr r1, r1, #(3 << 3) /* Outer cacheability */ - mcr p15, 0, r1, c2, c0, 0 /* Write Translation Table Base Register 0 */ - mcr p15, 0, r1, c2, c0, 1 /* Write Translation Table Base Register 1 */ + orr r1, r1, #(1 | (1 << 6) | (3 << 3) | 2) /* Inner and outer cacheability */ + mcr p15, 0, r1, c2, c0, 0 /* Write Translation Table Base Register 0 */ + mcr p15, 0, r1, c2, c0, 1 /* Write Translation Table Base Register 1 */ /* Set all Domains to Client */ ldr r1, =0x55555555 From 48f7197c72f09257130a0a1c67eb46d6298f3c7c Mon Sep 17 00:00:00 2001 From: Jacek Maksymowicz Date: Fri, 10 May 2024 15:46:18 +0200 Subject: [PATCH 5/7] zynq7000: initialize all CPU cores Activate SCU and cache/TLB maintenance broadcast JIRA: RTOS-796 --- hal/armv7a/zynq7000/_init.S | 82 ++++++++++++++++++++++++-------- hal/armv7a/zynq7000/interrupts.c | 7 +++ hal/armv7a/zynq7000/zynq.c | 12 +++++ 3 files changed, 80 insertions(+), 21 deletions(-) diff --git a/hal/armv7a/zynq7000/_init.S b/hal/armv7a/zynq7000/_init.S index a82f0945f..750be15dd 100644 --- a/hal/armv7a/zynq7000/_init.S +++ b/hal/armv7a/zynq7000/_init.S @@ -22,6 +22,7 @@ .extern pmap_common .extern syspage .extern relOffs +.extern nCpusStarted .extern _end #define PA_KERNEL (ADDR_DDR) @@ -102,10 +103,10 @@
_start: orr r0, #1 << 31 /* Enable cycle counter */ mcr p15, 0, r0, c9, c12, 1 /* Write CESR (Count Enable Set Register) */ - /* Enable SMP */ - mrc p15, 0, r1, c1, c0, 1 - orr r1, r1, #(1 << 6) - mcr p15, 0, r1, c1, c0, 1 + /* Enable SMP and cache/TLB maintenance broadcast */ + mrc p15, 0, r1, c1, c0, 1 /* read ACTLR (Auxiliary System Control Register) */ + orr r1, r1, #((1 << 6) | (1 << 0)) /* bit 6: SMP, bit 0: Cache and TLB maintenance broadcast */ + mcr p15, 0, r1, c1, c0, 1 /* write ACTLR (Auxiliary System Control Register) */ /* Disable MMU */ mrc p15, 0, r1, c1, c0, 0 /* Read SCTLR (System Control Register) data */ @@ -119,6 +120,11 @@ _start: bic r1, r1, #(0x1 << 2) /* Disable DCache */ mcr p15, 0, r1, c1, c0, 0 /* Write SCTLR (System Control Register) data */ + /* Invalidate SCU (Snoop Control Unit) */ + ldr r1, =0xf8f0000c + ldr r0, =0xffff + str r0, [r1] + /* Invalidate L1 ICache */ mov r1, #0 mcr p15, 0, r1, c7, c5, 0 /* Clear ICIALLU */ @@ -145,15 +151,15 @@ set_loop: /* Invalidate TLB */ mcr p15, 0, r1, c8, c7, 0 - /* Read CPU ID, only core 0 continues initialization */ + ldr r2, =PA_OF(nCpusStarted) + mov r1, #0 + str r1, [r2] + + /* Read CPU ID, core 0 inits memory structures, the rest wait */ mrc p15, 0, r1, c0, c0, 5 /* Read Multiprocessor Affinity Register */ - ands r1, r1, #0xf /* Extract CPU ID */ - beq structs_init -wait_loop: - wfi - b wait_loop + ands r1, r1, #0xf /* Extract CPU ID */ + bne wait_for_structs_init -structs_init: /* init memory structures (relOffs, syspage, TTLs) */ ldr r1, =#VA_SYSPAGE lsr r1, #12 @@ -189,7 +195,7 @@ clear_ttls: str r1, [r5, r2] bne clear_ttls - /* Needed to execute first stage of kernel, should be unmapped later */ + /* Needed to execute first stage of kernel, should be unmapped after all CPUs have jumped to virtual memory */ /* Map 4 MB V 0x00100000 -> P 0x00100000 */ add r0, r5, #TTL1_OFFSET_OF(PA_KERNEL) /* Entry address: TTL1 base address + entry index * 4 B (entry size) */ ldr r1, =((PA_KERNEL & 
~0xfffff) | (0x1 << 10) | 0x2) /* Section entry: base address - DDR, AP = 01, APX = 0 (privileged access only) */ @@ -220,16 +226,16 @@ clear_ttls: str r1, [r0] /* Fill TTL2 entry */ - /* Stack TTL2 entry */ + /* Stacks TTL2 entry (one stack per CPU) */ /* Map V 0xfffff000 -> P PA_STACK */ ldr r0, =(PA_OF(VA_TTL2_EXC) + (0x3ff << 2)) /* Entry address: the last entry in 4 TTL2 in pmap_common.excptab */ ldr r1, =((PA_STACK & ~0xfff) | KERNEL_PAGE_ATTR) - str r1, [r0] - - /* Set vector table pointer to virtual address */ - ldr r0, =_vector_table - mcr p15, 0, r0, c12, c0, 0 /* Write to VBAR (Vector Base Address Register) */ - + mov r2, #NUM_CPUS +stack_for_cpu: + str r1, [r0], #-4 /* Fill TTL2 entry */ + sub r1, #SIZE_PAGE + subs r2, #1 + bne stack_for_cpu /* Kernel TTL2 entries (pmap_common.kptab) */ ldr r0, =PA_OF(VA_TTL2_K) @@ -288,6 +294,19 @@ kernel_ttl2: ldr r1, =(PA_TTC | DEVICE_PAGE_ATTR) str r1, [r0], #4 + b per_core_init + +wait_for_structs_init: + dsb + wfe + ldr r0, [r2] + cmp r0, #0 + beq wait_for_structs_init + +per_core_init: + /* Set vector table pointer to virtual address */ + ldr r0, =_vector_table + mcr p15, 0, r0, c12, c0, 0 /* Write to VBAR (Vector Base Address Register) */ /* Initialize MMU */ mov r1, #1 @@ -312,6 +331,11 @@ kernel_ttl2: bic r1, r1, #(0x1 << 28) /* Disable TEX remap */ mcr p15, 0, r1, c1, c0, 0 /* Write SCTLR (System Control Register) data */ + /* Enable SCU */ + ldr r1, =0xf8f00000 + ldr r0, [r1] + orr r0, r0, #0x1 + str r0, [r1] /* Enable MMU */ mrc p15, 0, r1, c1, c0, 0 /* Read Control Register configuration data */ @@ -327,8 +351,13 @@ kernel_ttl2: mcr p15, 0, r0, c5, c0, 1 /* IFSR */ mcr p15, 0, r0, c6, c0, 2 /* IFAR */ + /* Get CPU ID so we can calculate which stack to use */ + mrc p15, 0, r1, c0, c0, 5 /* Read Multiprocessor Affinity Register */ + and r1, r1, #0xf /* Extract CPU ID */ + /* Setup stacks */ eor r0, r0 /* initialize SP to top of memory */ + sub r0, r1, lsl #12 /* every subsequent CPU goes one page down */ /* 
FIQ mode stack */ msr CPSR_c, #(FIQ_MODE | NO_INT) @@ -368,8 +397,19 @@ kernel_ttl2: orr r0, r0, #(0x1 << 30) /* FPU enable bit */ vmsr fpexc, r0 - /* Jump to main() in virtual memory */ - ldr pc, =main + /* Jump to virtual memory */ + cmp r1, #0 + ldreq pc, =main + ldr pc, =other_core_main + +other_core_main: + blx _hal_interruptsInitPerCPU + blx _hal_cpuInit + cpsie aif +other_core_wait: + wfi + b other_core_wait + #include "hal/armv7a/_interrupts.S" #include "hal/armv7a/_armv7a.S" diff --git a/hal/armv7a/zynq7000/interrupts.c b/hal/armv7a/zynq7000/interrupts.c index 64b999d72..5eb21ba12 100644 --- a/hal/armv7a/zynq7000/interrupts.c +++ b/hal/armv7a/zynq7000/interrupts.c @@ -71,6 +71,7 @@ static const u8 spiConf[] = { /* IRQID: 88-95 */ high_lvl, high_lvl, high_lvl, high_lvl, rising_edge, reserved, reserved, reserved }; +void _hal_interruptsInitPerCPU(void); extern int threads_schedule(unsigned int n, cpu_context_t *context, void *arg); @@ -240,6 +241,12 @@ void _hal_interruptsInit(void) /* enable_secure = 1 */ *(interrupts_common.gic + ddcr) |= 0x3; + _hal_interruptsInitPerCPU(); +} + + +void _hal_interruptsInitPerCPU(void) +{ *(interrupts_common.gic + cicr) &= ~0x3; /* Initialize CPU Interface of the gic diff --git a/hal/armv7a/zynq7000/zynq.c b/hal/armv7a/zynq7000/zynq.c index 0c5bff0ca..accceeb43 100644 --- a/hal/armv7a/zynq7000/zynq.c +++ b/hal/armv7a/zynq7000/zynq.c @@ -78,6 +78,7 @@ struct { extern unsigned int _end; +volatile unsigned int nCpusStarted = 0; static void _zynq_slcrLock(void) @@ -693,6 +694,17 @@ static u32 checkNumCPUs(void) void _hal_cpuInit(void) { zynq_common.nCpus = checkNumCPUs(); + hal_cpuAtomicInc(&nCpusStarted); + if (hal_cpuAtomicGet(&nCpusStarted) == 1) { + /* This is necessary because other CPU is still in physical memory + * with L1 cache turned off so SCU cannot enforce cache coherence */ + hal_cpuFlushDataCache((ptr_t)&nCpusStarted, (ptr_t)((&nCpusStarted) + 1)); + } + + hal_cpuSignalEvent(); + while 
(hal_cpuAtomicGet(&nCpusStarted) != zynq_common.nCpus) { + hal_cpuWaitForEvent(); + } } From 97a8988d316d47782433a220992d2dc60ce65b02 Mon Sep 17 00:00:00 2001 From: Jacek Maksymowicz Date: Mon, 8 Apr 2024 16:24:50 +0200 Subject: [PATCH 6/7] zynq7000: adjust interrupts for SMP Allow interrupts to run on any core Schedule timer interrupt in a round-robin fashion across both cores Adjust timer to keep the current scheduling period on both cores JIRA: RTOS-796 --- hal/armv7a/zynq7000/interrupts.c | 8 +++++--- hal/armv7a/zynq7000/timer.c | 17 ++++++++++++----- hal/armv7a/zynq7000/zynq.h | 3 +++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/hal/armv7a/zynq7000/interrupts.c b/hal/armv7a/zynq7000/interrupts.c index 5eb21ba12..e602e7c65 100644 --- a/hal/armv7a/zynq7000/interrupts.c +++ b/hal/armv7a/zynq7000/interrupts.c @@ -31,6 +31,8 @@ #define SGI_FLT_OTHER_CPUS 1 /* Send SGI to all CPUs except the one that called this function */ #define SGI_FLT_THIS_CPU 2 /* Send SGI to the CPU that called this function */ +#define DEFAULT_CPU_MASK 0x3 + /* clang-format off */ enum { @@ -133,7 +135,7 @@ static void interrupts_setConf(unsigned int irqn, u32 conf) } -static void interrupts_setCPU(unsigned int irqn, u32 cpuID) +void _zynq_interrupts_setCPU(unsigned int irqn, u32 cpuID) { u32 mask; @@ -167,7 +169,7 @@ int hal_interruptsSetHandler(intr_handler_t *h) HAL_LIST_ADD(&interrupts_common.handlers[h->n], h); interrupts_setPriority(h->n, 0xa); - interrupts_setCPU(h->n, 0x1); + _zynq_interrupts_setCPU(h->n, DEFAULT_CPU_MASK); interrupts_enableIRQ(h->n); hal_spinlockClear(&interrupts_common.spinlock[h->n], &sc); @@ -228,7 +230,7 @@ void _hal_interruptsInit(void) /* Set required configuration and CPU_0 as a default processor */ for (i = SPI_FIRST_IRQID; i < SIZE_INTERRUPTS; ++i) { interrupts_setConf(i, spiConf[i - SPI_FIRST_IRQID]); - interrupts_setCPU(i, 0x1); + _zynq_interrupts_setCPU(i, DEFAULT_CPU_MASK); } /* SGI and PPI interrupts are fixed to always be on both 
CPUs */ diff --git a/hal/armv7a/zynq7000/timer.c b/hal/armv7a/zynq7000/timer.c index eac264e6d..11a50a1b5 100644 --- a/hal/armv7a/zynq7000/timer.c +++ b/hal/armv7a/zynq7000/timer.c @@ -18,6 +18,8 @@ #include "hal/spinlock.h" #include "hal/string.h" +#include "zynq.h" + #define TIMER_SRC_CLK_CPU_1x 111111115 /* Hz */ #define TIMER_IRQ_ID 42 @@ -47,13 +49,18 @@ static int _timer_irqHandler(unsigned int n, cpu_context_t *ctx, void *arg) (void)arg; (void)ctx; - u32 st = *(timer_common.ttc + isr); - + spinlock_ctx_t sc; + hal_spinlockSet(&timer_common.sp, &sc); /* Interval IRQ */ - if (st & 0x1) { + if ((*(timer_common.ttc + isr) & 1) != 0) { timer_common.jiffies += timer_common.ticksPerFreq; } + hal_spinlockClear(&timer_common.sp, &sc); + + u32 nextID = hal_cpuGetID() + 1; + u32 nextTargetCPU = (nextID == hal_cpuGetCount()) ? 1 : (1 << nextID); + _zynq_interrupts_setCPU(n, nextTargetCPU); hal_cpuDataSyncBarrier(); return 0; @@ -62,7 +69,7 @@ static int _timer_irqHandler(unsigned int n, cpu_context_t *ctx, void *arg) static time_t hal_timerCyc2Us(time_t cyc) { - return (cyc * 1000LL) / (time_t)(timer_common.ticksPerFreq); + return (cyc * 1000LL) / (time_t)(timer_common.ticksPerFreq * hal_cpuGetCount()); } @@ -167,7 +174,7 @@ void _hal_timerInit(u32 interval) /* Reset counters and restart counting */ *(timer_common.ttc + cnt_ctrl) = 0x10; - hal_timerSetPrescaler(interval); + hal_timerSetPrescaler(interval * hal_cpuGetCount()); hal_spinlockCreate(&timer_common.sp, "timer"); timer_common.handler.f = _timer_irqHandler; diff --git a/hal/armv7a/zynq7000/zynq.h b/hal/armv7a/zynq7000/zynq.h index 9b8528d91..80a31e989 100644 --- a/hal/armv7a/zynq7000/zynq.h +++ b/hal/armv7a/zynq7000/zynq.h @@ -25,4 +25,7 @@ extern int _zynq_setMIO(unsigned int pin, char disableRcvr, char pullup, char io extern int _zynq_setAmbaClk(u32 dev, u32 state); +extern void _zynq_interrupts_setCPU(unsigned int irqn, u32 cpuID); + + #endif From b1884db67e81f959b4b528c60cc892e965cbc7d7 Mon Sep 17 00:00:00 
2001 From: Jacek Maksymowicz Date: Fri, 26 Apr 2024 16:35:38 +0200 Subject: [PATCH 7/7] zynq7000: enable L2 cache JIRA: RTOS-796 --- hal/armv7a/zynq7000/zynq.c | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/hal/armv7a/zynq7000/zynq.c b/hal/armv7a/zynq7000/zynq.c index accceeb43..0bbcbb2cf 100644 --- a/hal/armv7a/zynq7000/zynq.c +++ b/hal/armv7a/zynq7000/zynq.c @@ -20,6 +20,7 @@ #include "include/arch/armv7a/zynq7000/zynq7000.h" +/* clang-format off */ /* SLCR (System Level Control Registers) */ enum { /* SLCR protection registers */ @@ -61,6 +62,7 @@ enum { slcr_sd0_wp_cd_sel = 0x20c, slcr_sd1_wp_cd_sel, slcr_lvl_shftr_en = 0x240, slcr_ocm_cfg = 0x244, + slcr_l2c_ram_reg = 0x287, /* GPIO config registers */ slcr_gpiob_ctrl = 0x2c0, slcr_gpiob_cfg_cmos18, slcr_gpiob_cfg_cmos25, slcr_gpiob_cfg_cmos33, slcr_gpiob_cfg_hstl = 0x2c5, slcr_gpiob_drvr_bias_ctrl, @@ -70,9 +72,21 @@ enum { }; +enum { + l2cc_ctrl = 0x40, l2cc_aux_ctrl, l2cc_tag_ram_ctrl, l2cc_data_ram_ctrl, + l2cc_int_mask = 0x85, l2cc_int_mask_status, l2cc_int_raw, l2cc_int_clear, + l2cc_sync = 0x1cc, + l2cc_inval_pa = 0x1dc, l2cc_inval_way = 0x1df, + l2cc_clean_pa = 0x1ec, l2cc_clean_index = 0x1ee, l2cc_clean_way, + l2cc_flush_pa = 0x1fc, l2cc_flush_index = 0x1fe, l2cc_flush_way, +}; +/* clang-format on */ + + struct { spinlock_t pltctlSp; volatile u32 *slcr; + volatile u32 *l2cc; unsigned int nCpus; } zynq_common; @@ -652,10 +666,33 @@ int hal_platformctl(void *ptr) } +static void _zynq_activateL2Cache(void) +{ + *(zynq_common.l2cc + l2cc_ctrl) = 0; /* Disable L2 cache */ + hal_cpuDataMemoryBarrier(); + *(zynq_common.l2cc + l2cc_aux_ctrl) |= 0x72360000; /* Enable early BRESP, instruction and data prefetching, round-robin replacement, parity, event monitor bus; Way Size (64 KB) */ + *(zynq_common.l2cc + l2cc_tag_ram_ctrl) = 0x0111; /* 7 Cycles of latency for TAG RAM */ + *(zynq_common.l2cc + l2cc_data_ram_ctrl) = 0x0121; /* 7 Cycles of latency for DATA RAM */ + *(zynq_common.l2cc + l2cc_inval_way) =
0xFFFF; /* Invalidate everything */ + hal_cpuDataMemoryBarrier(); + while (*(zynq_common.l2cc + l2cc_sync) != 0) { + /* wait for completion */ + } + + *(zynq_common.l2cc + l2cc_int_clear) = *(zynq_common.l2cc + l2cc_int_raw); /* Clear pending interrupts */ + _zynq_slcrUnlock(); + *(zynq_common.slcr + slcr_l2c_ram_reg) = 0x00020202; /* Magic value, not described in detail */ + _zynq_slcrLock(); + hal_cpuDataMemoryBarrier(); + *(zynq_common.l2cc + l2cc_ctrl) |= 1; /* Enable L2 cache */ +} + + void _hal_platformInit(void) { hal_spinlockCreate(&zynq_common.pltctlSp, "pltctl"); zynq_common.slcr = (void *)(((u32)&_end + 9 * SIZE_PAGE - 1) & ~(SIZE_PAGE - 1)); + zynq_common.l2cc = (void *)(((u32)&_end + 7 * SIZE_PAGE - 1) & ~(SIZE_PAGE - 1)); } @@ -705,6 +742,10 @@ void _hal_cpuInit(void) while (hal_cpuAtomicGet(&nCpusStarted) != zynq_common.nCpus) { hal_cpuWaitForEvent(); } + + if (hal_cpuGetID() == 0) { + _zynq_activateL2Cache(); + } }