From 43ba9b2e4b1a50b6e7a12bbc58685e5601963260 Mon Sep 17 00:00:00 2001
From: Paul Cercueil
Date: Sat, 16 Nov 2024 04:49:09 +0100
Subject: [PATCH 1/2] Add support for Store Queues with MMU (#779)

* mmu: Add function mmu_enabled()

This function can be used to know whether or not the MMU is in use.

* mmu: Support address translation of store queues

Reserve the last two TLB entries and map a 1 MiB page each for use with
the SQ area. Add the function mmu_set_sq_addr(), which will overwrite
the physical address that the SQ area points to in the TLB.

* sq: Update sq_lock() prototype to work with MMU

Change its prototype: it now returns a pointer, which is the translated
address of the source pointer. In non-MMU use, it returns the same
value as SQ_MASK_DEST(src).

When the MMU is enabled, sq_lock() will reset the address translation
for the SQ memory area and convert the source address into an offset
within that memory area, making it possible to use SQs while the MMU is
enabled.

The caveat is that we only map one megabyte of SQ memory area for
simplicity, and therefore the SQ memory window is reduced. This should
be fine, though, as apart from e.g. a whole-VRAM clear, nobody will
ever need such a big window.

* sq: Update sq_cpy() to work with MMU

When using the MMU, the SQ area is a 1 MiB page, so we must not copy
data outside this area.

* sq: Update sq_set32() to work with MMU

When using the MMU, the SQ area is a 1 MiB page, so we must not copy
data outside this area.

* mmu: Remove mmu_disable() / mmu_restore()

Now that SQs work with the MMU, those two functions aren't useful
anymore and can be dropped.

---------

Signed-off-by: Paul Cercueil
---
 kernel/arch/dreamcast/hardware/sq.c      | 122 ++++++++++++++---------
 kernel/arch/dreamcast/include/arch/mmu.h |  22 ++--
 kernel/arch/dreamcast/include/dc/sq.h    |   5 +-
 kernel/arch/dreamcast/kernel/mmu.c       |  29 ++++--
 4 files changed, 106 insertions(+), 72 deletions(-)

diff --git a/kernel/arch/dreamcast/hardware/sq.c b/kernel/arch/dreamcast/hardware/sq.c
index 1c442f098..350f42716 100644
--- a/kernel/arch/dreamcast/hardware/sq.c
+++ b/kernel/arch/dreamcast/hardware/sq.c
@@ -56,9 +56,7 @@ static mutex_t sq_mutex = RECURSIVE_MUTEX_INITIALIZER;
 
 typedef struct sq_state {
-    uint8_t dest0;
-    uint8_t dest1;
-    mmu_token_t mmu_token;
+    uint32_t dest;
 } sq_state_t;
 
 #ifndef SQ_STATE_CACHE_SIZE
@@ -67,8 +65,10 @@ typedef struct sq_state {
 
 static sq_state_t sq_state_cache[SQ_STATE_CACHE_SIZE] = {0};
 
-void sq_lock(void *dest) {
+uint32_t *sq_lock(void *dest) {
     sq_state_t *new_state;
+    bool with_mmu;
+    uint32_t mask;
 
     mutex_lock(&sq_mutex);
 
@@ -76,17 +76,22 @@ void sq_lock(void *dest) {
 
     new_state = &sq_state_cache[sq_mutex.count - 1];
 
-    /* Disable MMU, because SQs work differently when it's enabled, and we
-     * don't support it. */
-    new_state->mmu_token = mmu_disable();
+    new_state->dest = (uint32_t)dest;
 
-    new_state->dest0 = new_state->dest1 = QACR_EXTERN_BITS(dest);
+    with_mmu = mmu_enabled();
+    mask = with_mmu ? 0x000fffe0 : 0x03ffffe0;
-    SET_QACR_REGS_INNER(new_state->dest0, new_state->dest1);
+    if (with_mmu)
+        mmu_set_sq_addr(dest);
+    else
+        SET_QACR_REGS(dest, dest);
+
+    return (uint32_t *)(MEM_AREA_SQ_BASE | ((uintptr_t)dest & mask));
 }
 
 void sq_unlock(void) {
     sq_state_t *tmp_state;
+    bool with_mmu;
 
     if(sq_mutex.count == 0) {
         dbglog(DBG_WARNING, "sq_unlock: Called without any lock\n");
@@ -95,13 +100,15 @@ void sq_unlock(void) {
 
     tmp_state = &sq_state_cache[sq_mutex.count - 1];
 
-    /* Restore the mmu state that we had started with */
-    mmu_restore(tmp_state->mmu_token);
 
     /* If we aren't the last entry, set the regs back where they belong */
     if(sq_mutex.count - 1) {
         tmp_state = &sq_state_cache[sq_mutex.count - 2];
-        SET_QACR_REGS_INNER(tmp_state->dest0, tmp_state->dest1);
+        with_mmu = mmu_enabled();
+
+        if (with_mmu)
+            mmu_set_sq_addr((void *)tmp_state->dest);
+        else
+            SET_QACR_REGS(tmp_state->dest, tmp_state->dest);
     }
 
     mutex_unlock(&sq_mutex);
@@ -115,38 +122,48 @@ void sq_wait(void) {
 
 /* Copies n bytes from src to dest, dest must be 32-byte aligned */
 __attribute__((noinline)) void *sq_cpy(void *dest, const void *src, size_t n) {
-    uint32_t *d = SQ_MASK_DEST(dest);
     const uint32_t *s = src;
+    void *curr_dest = dest;
+    uint32_t *d;
+    size_t nb;
 
     /* Fill/write queues as many times necessary */
     n >>= 5;
 
-    /* Exit early if we dont have enough data to copy */
-    if(n == 0)
-        return dest;
-
-    sq_lock(dest);
-
-    /* If src is not 8-byte aligned, slow path */
-    if ((uintptr_t)src & 7) {
-        while(n--) {
-            dcache_pref_block(s + 8); /* Prefetch 32 bytes for next loop */
-            d[0] = *(s++);
-            d[1] = *(s++);
-            d[2] = *(s++);
-            d[3] = *(s++);
-            d[4] = *(s++);
-            d[5] = *(s++);
-            d[6] = *(s++);
-            d[7] = *(s++);
-            sq_flush(d);
-            d += 8;
+    while (n > 0) {
+        /* Transfer maximum 1 MiB at once. This is because when using the
+         * MMU the SQ area is 2 MiB, and the destination address may
+         * not be on a page boundary. */
+        nb = n > 0x8000 ? 0x8000 : n;
+
+        d = sq_lock(curr_dest);
+
+        curr_dest += nb * 32;
+        n -= nb;
+
+        /* If src is not 8-byte aligned, slow path */
+        if ((uintptr_t)src & 7) {
+            while(nb--) {
+                dcache_pref_block(s + 8); /* Prefetch 32 bytes for next loop */
+                d[0] = *(s++);
+                d[1] = *(s++);
+                d[2] = *(s++);
+                d[3] = *(s++);
+                d[4] = *(s++);
+                d[5] = *(s++);
+                d[6] = *(s++);
+                d[7] = *(s++);
+                sq_flush(d);
+                d += 8;
+            }
+        } else { /* If src is 8-byte aligned, fast path */
+            sq_fast_cpy(d, s, nb);
+            s += nb * 32;
         }
-    } else { /* If src is 8-byte aligned, fast path */
-        sq_fast_cpy(d, s, n);
+
+        sq_unlock();
     }
 
-    sq_unlock();
     return dest;
 }
 
@@ -170,25 +187,34 @@ void *sq_set16(void *dest, uint32_t c, size_t n) {
 
 /* Fills n bytes at dest with int c, dest must be 32-byte aligned */
 void *sq_set32(void *dest, uint32_t c, size_t n) {
-    uint32_t *d = SQ_MASK_DEST(dest);
+    void *curr_dest = dest;
+    uint32_t *d;
+    size_t nb;
 
     /* Write them as many times necessary */
    n >>= 5;
 
-    /* Exit early if we dont have enough data to set */
-    if(n == 0)
-        return dest;
+    while (n > 0) {
+        /* Transfer maximum 1 MiB at once. This is because when using the
+         * MMU the SQ area is 2 MiB, and the destination address may
+         * not be on a page boundary. */
+        nb = n > 0x8000 ? 0x8000 : n;
 
-    sq_lock(dest);
+        d = sq_lock(curr_dest);
+
+        curr_dest += nb * 32;
+        n -= nb;
+
+        while(nb--) {
+            /* Fill both store queues with c */
+            d[0] = d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = d[7] = c;
+            sq_flush(d);
+            d += 8;
-    while(n--) {
-        /* Fill both store queues with c */
-        d[0] = d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = d[7] = c;
-        sq_flush(d);
-        d += 8;
+        }
+
+        sq_unlock();
     }
 
-    sq_unlock();
     return dest;
 }
diff --git a/kernel/arch/dreamcast/include/arch/mmu.h b/kernel/arch/dreamcast/include/arch/mmu.h
index 5b4a6cf83..70c3323b3 100644
--- a/kernel/arch/dreamcast/include/arch/mmu.h
+++ b/kernel/arch/dreamcast/include/arch/mmu.h
@@ -48,6 +48,8 @@
 #include
 __BEGIN_DECLS
 
+#include <stdbool.h>
+
 #include
 #include
 
@@ -195,9 +197,6 @@ typedef struct mmucontext {
     to do so. */
 extern mmucontext_t *mmu_cxt_current;
-
-struct mmu_token;
-typedef struct mmu_token *mmu_token_t;
 /** \endcond */
 
 /** \brief Set the "current" page tables for TLB handling.
@@ -373,19 +372,18 @@ void mmu_shutdown(void);
  */
 void mmu_reset_itlb(void);
 
-/** \brief Temporarily disable MMU address translation.
+/** \brief Check if MMU translation is enabled.
     \ingroup mmu
 
-    \return An opaque token to be passed to mmu_restore()
+    \return True if MMU translation is enabled, false otherwise.
  */
-mmu_token_t mmu_disable(void);
+bool mmu_enabled(void);
 
-/** \brief Restore MMU address translation.
-    \ingroup mmu
-
-    \param token The opaque token obtained from mmu_disable()
- */
-void mmu_restore(mmu_token_t token);
+/** \brief Reset the base target address for store queues.
+ *  \ingroup mmu
+ *
+ *  \param addr The base address to reset to */
+void mmu_set_sq_addr(void *addr);
 
 __END_DECLS
diff --git a/kernel/arch/dreamcast/include/dc/sq.h b/kernel/arch/dreamcast/include/dc/sq.h
index 37ffbeebb..6ab0987cc 100644
--- a/kernel/arch/dreamcast/include/dc/sq.h
+++ b/kernel/arch/dreamcast/include/dc/sq.h
@@ -69,9 +69,12 @@ __BEGIN_DECLS
     however, it must be called manually when driving the SQs directly from
     outside of this API.
 
+    \param dest  The destination address.
+    \return      The translated address that can be directly written to.
+
     \sa sq_unlock()
  */
-void sq_lock(void *dest);
+uint32_t *sq_lock(void *dest);
 
 /** \brief Unlock Store Queues
     \ingroup store_queues
diff --git a/kernel/arch/dreamcast/kernel/mmu.c b/kernel/arch/dreamcast/kernel/mmu.c
index 466a15612..aa9bd96ff 100644
--- a/kernel/arch/dreamcast/kernel/mmu.c
+++ b/kernel/arch/dreamcast/kernel/mmu.c
@@ -738,9 +738,15 @@ int mmu_init(void) {
     irq_set_handler(EXC_DTLB_PV_WRITE, dtlb_pv_write, NULL);
     irq_set_handler(EXC_INITIAL_PAGE_WRITE, initial_page_write, NULL);
 
-    /* Turn on MMU */
-    /* URB=0x3f, URC=0, SQMD=1, SV=0, TI=1, AT=1 */
-    SET_MMUCR(0x3f, 0, 1, 0, 1, 1);
+    /* Reserve TLB entries 62-63 for SQ translation. Register them as read-write
+     * (since there's no write-only flag) with a 1 MiB page. */
+    SET_MMUCR(0x3e, 0x3e, 1, 0, 1, 1);
+    mmu_ldtlb(0, 0xe0000000, 0, 3, 1, 0, 0, 0, 0);
+    SET_MMUCR(0x3f, 0x3f, 1, 0, 1, 1);
+    mmu_ldtlb(0, 0xe0100000, 0, 3, 1, 0, 0, 0, 0);
+
+    /* Set URB to 0x3d to not overwrite the SQ config, reset URC, enable MMU */
+    SET_MMUCR(0x3d, 0, 1, 0, 1, 1);
 
     /* Clear the ITLB */
     mmu_reset_itlb();
@@ -767,14 +773,15 @@ void mmu_shutdown(void) {
     irq_set_handler(EXC_INITIAL_PAGE_WRITE, NULL, NULL);
 }
 
-mmu_token_t mmu_disable(void) {
-    mmu_token_t token = (mmu_token_t)*mmucr;
-
-    *mmucr &= ~0x1;
-
-    return token;
+bool mmu_enabled(void) {
+    return *mmucr & 0x1;
 }
 
-void mmu_restore(mmu_token_t token) {
-    *mmucr = (uint32)token;
+void mmu_set_sq_addr(void *addr) {
+    uint32_t ppn1 = (uint32_t)addr & 0x1ff00000;
+    uint32_t ppn2 = ppn1 + 0x00100000;
+
+    /* Reset the base target address for the SQs */
+    *(uint32_t *)(MEM_AREA_UTLB_DATA_ARRAY1_BASE + (0x3e << 8)) = ppn1 | 0x1fc;
+    *(uint32_t *)(MEM_AREA_UTLB_DATA_ARRAY1_BASE + (0x3f << 8)) = ppn2 | 0x1fc;
 }

From a60375e8ff1fdd99431cfbae34f54a3429d1bd94 Mon Sep 17 00:00:00 2001
From: Falco Girgis
Date: Fri, 15 Nov 2024 22:18:48 -0600
Subject: [PATCH 2/2] Gracefully return 0 for ARM/PVR pre-initted stats. (#830)

- Rather than asserting and crashing, which isn't really warranted,
  modified pvr_mem_available() and snd_mem_available() to gracefully
  return 0 when called before PVR/Sound RAM has been initialized.

- This way profilers and monitors in other threads can still boot up
  and show proper stats while things are being initialized, rather than
  simply crashing in debug builds and returning garbage in release
  builds.
---
 kernel/arch/dreamcast/hardware/pvr/pvr_mem.c | 3 ++-
 kernel/arch/dreamcast/sound/snd_mem.c        | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/arch/dreamcast/hardware/pvr/pvr_mem.c b/kernel/arch/dreamcast/hardware/pvr/pvr_mem.c
index 19fdf92c3..eeddf80c4 100644
--- a/kernel/arch/dreamcast/hardware/pvr/pvr_mem.c
+++ b/kernel/arch/dreamcast/hardware/pvr/pvr_mem.c
@@ -179,7 +179,8 @@ static uint32 pvr_mem_available_int(void) {
 }
 
 uint32 pvr_mem_available(void) {
-    CHECK_MEM_BASE;
+    if(!pvr_mem_base)
+        return 0;
 
     return pvr_mem_available_int() +
            (PVR_RAM_INT_TOP - (uint32)pvr_mem_base);
diff --git a/kernel/arch/dreamcast/sound/snd_mem.c b/kernel/arch/dreamcast/sound/snd_mem.c
index ee85418b1..1e6e55277 100644
--- a/kernel/arch/dreamcast/sound/snd_mem.c
+++ b/kernel/arch/dreamcast/sound/snd_mem.c
@@ -294,7 +294,8 @@ uint32 snd_mem_available(void) {
     snd_block_t *e;
     size_t largest = 0;
 
-    assert_msg(initted, "Use of snd_mem_available before snd_mem_init");
+    if(!initted)
+        return 0;
 
     if(irq_inside_int()) {
        if(!spinlock_trylock(&snd_mem_mutex)) {