From 7718dc15c752af431686182a37ae8b7bb7c82460 Mon Sep 17 00:00:00 2001
From: Po-Ju Chen
Date: Thu, 30 Oct 2025 02:39:25 +0800
Subject: [PATCH 1/4] Implement direct-mapped instruction cache

Extend the existing architecture with a direct-mapped instruction cache
that stores recently fetched instructions. Add the related constants and
macros for the cache size and address fields.
---
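Note: the lookup added below splits each 32-bit fetch address as
[ tag | index | offset ]. A standalone sketch of that decomposition
(toy code, not part of the patch; the constants mirror the ones added
in riscv.h):

    #include <stdint.h>
    #include <stdio.h>

    #define ICACHE_OFFSET_BITS 8 /* log2(256-byte block) */
    #define ICACHE_INDEX_BITS 8  /* log2(256 blocks) */

    int main(void)
    {
        uint32_t addr = 0x80123456u;
        /* low 8 bits: byte offset inside the block */
        uint32_t ofs = addr & ((1u << ICACHE_OFFSET_BITS) - 1);
        /* next 8 bits: which block, i.e. the direct-mapped slot */
        uint32_t idx =
            (addr >> ICACHE_OFFSET_BITS) & ((1u << ICACHE_INDEX_BITS) - 1);
        /* remaining 16 bits: tag compared on every lookup */
        uint32_t tag = addr >> (ICACHE_OFFSET_BITS + ICACHE_INDEX_BITS);
        /* prints: addr=80123456 -> tag=8012 idx=34 ofs=56 */
        printf("addr=%08x -> tag=%04x idx=%02x ofs=%02x\n", (unsigned) addr,
               (unsigned) tag, (unsigned) idx, (unsigned) ofs);
        return 0;
    }

With 256 blocks of 256 bytes each, the cache covers 64 KiB of hot code
before conflict misses set in.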
 riscv.c | 41 +++++++++++++++++++++++++++++++++--------
 riscv.h | 30 ++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+), 8 deletions(-)

diff --git a/riscv.c b/riscv.c
index 3e00751..1af0a64 100644
--- a/riscv.c
+++ b/riscv.c
@@ -1,4 +1,5 @@
 #include
+#include <string.h>
 
 #include "common.h"
 #include "device.h"
@@ -180,6 +181,11 @@ static inline uint32_t read_rs2(const hart_t *vm, uint32_t insn)
     return vm->x_regs[decode_rs2(insn)];
 }
 
+static inline void icache_invalidate_all(hart_t *vm)
+{
+    memset(&vm->icache, 0, sizeof(vm->icache));
+}
+
 /* virtual addressing */
 
 void mmu_invalidate(hart_t *vm)
@@ -197,6 +203,7 @@ void mmu_invalidate(hart_t *vm)
             vm->cache_store[set].ways[way].n_pages = 0xFFFFFFFF;
         vm->cache_store[set].lru = 0; /* Reset LRU to way 0 */
     }
+    icache_invalidate_all(vm);
 }
 
 /* Invalidate MMU caches for a specific virtual address range.
@@ -361,11 +368,27 @@ static void mmu_fence(hart_t *vm, uint32_t insn UNUSED)
 
 static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
 {
-    uint32_t vpn = addr >> RV_PAGE_SHIFT;
-    if (unlikely(vpn != vm->cache_fetch.n_pages)) {
+    /* icache lookup */
+    uint32_t idx = (addr >> ICACHE_OFFSET_BITS) & ICACHE_INDEX_MASK;
+    uint32_t tag = addr >> (ICACHE_OFFSET_BITS + ICACHE_INDEX_BITS);
+    icache_block_t *blk = &vm->icache.block[idx];
+
+    if (likely(blk->valid && blk->tag == tag)) {
 #ifdef MMU_CACHE_STATS
-        vm->cache_fetch.misses++;
+        vm->cache_fetch.hits++;
 #endif
+        uint32_t ofs = addr & ICACHE_BLOCK_MASK;
+        *value = *(const uint32_t *) (blk->base + ofs);
+        return;
+    }
+
+#ifdef MMU_CACHE_STATS
+    vm->cache_fetch.misses++;
+#endif
+
+    /* icache miss: continue with the original VA->PA translation */
+    uint32_t vpn = addr >> RV_PAGE_SHIFT;
+    if (unlikely(vpn != vm->cache_fetch.n_pages)) {
         mmu_translate(vm, &addr, (1 << 3), (1 << 6), false, RV_EXC_FETCH_FAULT,
                       RV_EXC_FETCH_PFAULT);
         if (vm->error)
@@ -377,12 +400,14 @@ static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
         vm->cache_fetch.n_pages = vpn;
         vm->cache_fetch.page_addr = page_addr;
     }
-#ifdef MMU_CACHE_STATS
-    else {
-        vm->cache_fetch.hits++;
-    }
-#endif
+
     *value = vm->cache_fetch.page_addr[(addr >> 2) & MASK(RV_PAGE_SHIFT - 2)];
+
+    /* fill into the cache */
+    uint32_t block_off = (addr & RV_PAGE_MASK) & ~ICACHE_BLOCK_MASK;
+    blk->base = (const uint8_t *) vm->cache_fetch.page_addr + block_off;
+    blk->tag = tag;
+    blk->valid = true;
 }
 
 static void mmu_load(hart_t *vm,
diff --git a/riscv.h b/riscv.h
index 86619a1..da7dac7 100644
--- a/riscv.h
+++ b/riscv.h
@@ -75,7 +75,37 @@ typedef struct {
 typedef struct __hart_internal hart_t;
 typedef struct __vm_internel vm_t;
 
+/* ICACHE_BLOCKS_SIZE: Size of one instruction-cache block (line).
+ * ICACHE_BLOCKS: Number of blocks (lines) in the instruction cache.
+ *
+ * The cache address is decomposed into [ tag | index | offset ] fields:
+ * - block-offset bits = log2(ICACHE_BLOCKS_SIZE)
+ * - index bits = log2(ICACHE_BLOCKS)
+ */
+#define ICACHE_BLOCKS_SIZE 256
+#define ICACHE_BLOCKS 256
+#define ICACHE_OFFSET_BITS 8
+#define ICACHE_INDEX_BITS 8
+
+/* For power-of-two sizes, (size - 1) sets all low bits to 1,
+ * allowing fast extraction of an address field.
+ */
+#define ICACHE_INDEX_MASK (ICACHE_BLOCKS - 1)
+#define ICACHE_BLOCK_MASK (ICACHE_BLOCKS_SIZE - 1)
+#define RV_PAGE_MASK (RV_PAGE_SIZE - 1)
+
+typedef struct {
+    uint32_t tag;
+    const uint8_t *base;
+    bool valid;
+} icache_block_t;
+
+typedef struct {
+    icache_block_t block[ICACHE_BLOCKS];
+} icache_t;
+
 struct __hart_internal {
+    icache_t icache;
     uint32_t x_regs[32];
 
     /* LR reservation virtual address. last bit is 1 if valid */

From e499d87ae38718548c960600f44d9dbd7840e36c Mon Sep 17 00:00:00 2001
From: Po-Ju Chen
Date: Sat, 1 Nov 2025 02:21:19 +0800
Subject: [PATCH 2/4] Adopt 2-entry direct-mapped page cache

Replace the previous 1-entry direct-mapped design with a 2-entry
direct-mapped cache using hash-based indexing (the same parity-hash
scheme as cache_load). This allows two hot virtual pages to coexist
without thrashing. Measurement shows that the number of
virtual-to-physical translations during instruction fetch
(mmu_translate() calls) decreased by ~10%.
---
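Note: a toy demonstration of the parity-hash entry selection (standalone
code, not part of the patch; __builtin_parity is the same GCC/Clang
builtin the patch uses). Two pages whose VPNs differ in a single bit
(for example, an even page and the odd page right after it) hash to
different entries, so an instruction stream straddling that page
boundary keeps both translations cached:

    #include <stdint.h>
    #include <stdio.h>

    /* 1-bit parity hash over the virtual page number: returns 0 or 1 */
    static uint32_t fetch_entry(uint32_t vpn)
    {
        return __builtin_parity(vpn) & 0x1;
    }

    int main(void)
    {
        uint32_t vpn_a = 0x80000u, vpn_b = 0x80001u; /* adjacent pages */
        /* prints: vpn A -> entry 1, vpn B -> entry 0 */
        printf("vpn A -> entry %u, vpn B -> entry %u\n",
               (unsigned) fetch_entry(vpn_a), (unsigned) fetch_entry(vpn_b));
        return 0;
    }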
 main.c  | 13 ++++++++-----
 riscv.c | 29 +++++++++++++++++------------
 riscv.h |  3 ++-
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/main.c b/main.c
index a1d7b7f..36957a9 100644
--- a/main.c
+++ b/main.c
@@ -1024,8 +1024,11 @@ static void print_mmu_cache_stats(vm_t *vm)
     fprintf(stderr, "\n=== MMU Cache Statistics ===\n");
     for (uint32_t i = 0; i < vm->n_hart; i++) {
         hart_t *hart = vm->hart[i];
-        uint64_t fetch_total =
-            hart->cache_fetch.hits + hart->cache_fetch.misses;
+        uint64_t fetch_hits = 0, fetch_misses = 0;
+        fetch_hits = hart->cache_fetch[0].hits + hart->cache_fetch[1].hits;
+        fetch_misses =
+            hart->cache_fetch[0].misses + hart->cache_fetch[1].misses;
+        uint64_t fetch_total = fetch_hits + fetch_misses;
 
         /* Combine 8-set × 2-way load cache statistics */
         uint64_t load_hits = 0, load_misses = 0;
@@ -1048,11 +1051,11 @@ static void print_mmu_cache_stats(vm_t *vm)
         uint64_t store_total = store_hits + store_misses;
 
         fprintf(stderr, "\nHart %u:\n", i);
-        fprintf(stderr, "  Fetch: %12llu hits, %12llu misses",
-                hart->cache_fetch.hits, hart->cache_fetch.misses);
+        fprintf(stderr, "  Fetch: %12llu hits, %12llu misses", fetch_hits,
+                fetch_misses);
         if (fetch_total > 0)
             fprintf(stderr, " (%.2f%% hit rate)",
-                    100.0 * hart->cache_fetch.hits / fetch_total);
+                    100.0 * fetch_hits / fetch_total);
         fprintf(stderr, "\n");
 
         fprintf(stderr, "  Load:  %12llu hits, %12llu misses (8x2)", load_hits,
diff --git a/riscv.c b/riscv.c
index 1af0a64..a1eb656 100644
--- a/riscv.c
+++ b/riscv.c
@@ -190,7 +190,8 @@ static inline void icache_invalidate_all(hart_t *vm)
 
 void mmu_invalidate(hart_t *vm)
 {
-    vm->cache_fetch.n_pages = 0xFFFFFFFF;
+    vm->cache_fetch[0].n_pages = 0xFFFFFFFF;
+    vm->cache_fetch[1].n_pages = 0xFFFFFFFF;
     /* Invalidate all 8 sets × 2 ways for load cache */
     for (int set = 0; set < 8; set++) {
         for (int way = 0; way < 2; way++)
@@ -234,9 +235,11 @@ void mmu_invalidate_range(hart_t *vm, uint32_t start_addr, uint32_t size)
     uint32_t end_vpn = (uint32_t) end_addr >> RV_PAGE_SHIFT;
 
     /* Cache invalidation for fetch cache */
-    if (vm->cache_fetch.n_pages >= start_vpn &&
-        vm->cache_fetch.n_pages <= end_vpn)
-        vm->cache_fetch.n_pages = 0xFFFFFFFF;
+    for (int i = 0; i < 2; i++) {
+        if (vm->cache_fetch[i].n_pages >= start_vpn &&
+            vm->cache_fetch[i].n_pages <= end_vpn)
+            vm->cache_fetch[i].n_pages = 0xFFFFFFFF;
+    }
 
     /* Invalidate load cache: 8 sets × 2 ways */
     for (int set = 0; set < 8; set++) {
@@ -372,10 +375,12 @@ static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
     uint32_t idx = (addr >> ICACHE_OFFSET_BITS) & ICACHE_INDEX_MASK;
     uint32_t tag = addr >> (ICACHE_OFFSET_BITS + ICACHE_INDEX_BITS);
     icache_block_t *blk = &vm->icache.block[idx];
+    uint32_t vpn = addr >> RV_PAGE_SHIFT;
+    uint32_t index = __builtin_parity(vpn) & 0x1;
 
     if (likely(blk->valid && blk->tag == tag)) {
 #ifdef MMU_CACHE_STATS
-        vm->cache_fetch.hits++;
+        vm->cache_fetch[index].hits++;
 #endif
         uint32_t ofs = addr & ICACHE_BLOCK_MASK;
         *value = *(const uint32_t *) (blk->base + ofs);
@@ -383,12 +388,11 @@ static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
     }
 
 #ifdef MMU_CACHE_STATS
-    vm->cache_fetch.misses++;
+    vm->cache_fetch[index].misses++;
 #endif
 
     /* icache miss: continue with the original VA->PA translation */
-    uint32_t vpn = addr >> RV_PAGE_SHIFT;
-    if (unlikely(vpn != vm->cache_fetch.n_pages)) {
+    if (unlikely(vpn != vm->cache_fetch[index].n_pages)) {
         mmu_translate(vm, &addr, (1 << 3), (1 << 6), false, RV_EXC_FETCH_FAULT,
                       RV_EXC_FETCH_PFAULT);
         if (vm->error)
@@ -397,15 +401,16 @@ static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
         vm->mem_fetch(vm, addr >> RV_PAGE_SHIFT, &page_addr);
         if (vm->error)
             return;
-        vm->cache_fetch.n_pages = vpn;
-        vm->cache_fetch.page_addr = page_addr;
+        vm->cache_fetch[index].n_pages = vpn;
+        vm->cache_fetch[index].page_addr = page_addr;
     }
 
-    *value = vm->cache_fetch.page_addr[(addr >> 2) & MASK(RV_PAGE_SHIFT - 2)];
+    *value =
+        vm->cache_fetch[index].page_addr[(addr >> 2) & MASK(RV_PAGE_SHIFT - 2)];
 
     /* fill into the cache */
     uint32_t block_off = (addr & RV_PAGE_MASK) & ~ICACHE_BLOCK_MASK;
-    blk->base = (const uint8_t *) vm->cache_fetch.page_addr + block_off;
+    blk->base = (const uint8_t *) vm->cache_fetch[index].page_addr + block_off;
     blk->tag = tag;
     blk->valid = true;
 }
diff --git a/riscv.h b/riscv.h
index da7dac7..0157f38 100644
--- a/riscv.h
+++ b/riscv.h
@@ -136,7 +136,8 @@ struct __hart_internal {
      */
     uint32_t exc_cause, exc_val;
 
-    mmu_fetch_cache_t cache_fetch;
+    /* 2-entry direct-mapped cache with hash-based indexing */
+    mmu_fetch_cache_t cache_fetch[2];
     /* 8-set × 2-way set-associative cache with 3-bit parity hash indexing */
     mmu_cache_set_t cache_load[8];
     /* 8-set × 2-way set-associative cache for store operations */

From efde8b13eef3952cdec360fc7fde4073a32ba12b Mon Sep 17 00:00:00 2001
From: Po-Ju Chen
Date: Sun, 2 Nov 2025 16:56:01 +0800
Subject: [PATCH 3/4] Add victim cache for I-cache

Introduce a small victim cache to reduce conflict misses in the
direct-mapped instruction cache. On an I-cache miss, probe the victim
cache; on a hit, swap the victim block with the current I-cache block
and return the data. Measurement shows that the number of
virtual-to-physical translations during instruction fetch
(mmu_translate() calls) decreased by ~8%.
---
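Note: the victim cache keys each block by the full block address
(addr >> ICACHE_OFFSET_BITS), i.e. tag and index folded together, so
evicted lines from any icache set can share its 16 entries. A distilled
sketch of the eviction path under that reading (field and constant names
mirror the patch, but this is illustrative, not a drop-in):

    #include <stdbool.h>
    #include <stdint.h>

    #define ICACHE_INDEX_BITS 8
    #define VCACHE_BLOCKS 16

    typedef struct {
        uint32_t tag; /* icache: tag field only; vcache: full block key */
        const uint8_t *base;
        bool valid;
    } block_t;

    /* FIFO insertion: re-attach the set index to widen the icache tag
     * back into a full key, then advance the round-robin cursor. */
    static void evict_to_victim(block_t v_block[VCACHE_BLOCKS],
                                uint32_t *v_next,
                                const block_t *blk,
                                uint32_t idx)
    {
        block_t *vblk = &v_block[*v_next];
        *vblk = *blk;
        vblk->tag = (blk->tag << ICACHE_INDEX_BITS) | idx;
        vblk->valid = true;
        *v_next = (*v_next + 1) % VCACHE_BLOCKS;
    }

On a victim-cache hit the same key algebra runs in reverse: the block
swapped into the icache keeps only its tag field, and the block swapped
out gets the index folded back in.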
 riscv.c | 32 ++++++++++++++++++++++++++++++--
 riscv.h | 23 ++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/riscv.c b/riscv.c
index a1eb656..a3ae952 100644
--- a/riscv.c
+++ b/riscv.c
@@ -374,7 +374,7 @@ static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
     /* icache lookup */
     uint32_t idx = (addr >> ICACHE_OFFSET_BITS) & ICACHE_INDEX_MASK;
     uint32_t tag = addr >> (ICACHE_OFFSET_BITS + ICACHE_INDEX_BITS);
-    icache_block_t *blk = &vm->icache.block[idx];
+    icache_block_t *blk = &vm->icache.i_block[idx];
     uint32_t vpn = addr >> RV_PAGE_SHIFT;
     uint32_t index = __builtin_parity(vpn) & 0x1;
 
@@ -387,6 +387,25 @@ static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
         return;
     }
 
+    /* search the victim cache */
+    uint32_t vcache_key = addr >> ICACHE_OFFSET_BITS;
+    for (int i = 0; i < VCACHE_BLOCKS; i++) {
+        victim_cache_block_t *vblk = &vm->icache.v_block[i];
+
+        /* victim cache hit: swap blocks */
+        if (vblk->valid && vblk->tag == vcache_key) {
+            icache_block_t tmp = *blk;
+            *blk = *vblk;
+            *vblk = tmp;
+            blk->tag = tag;
+            vblk->tag = (tmp.tag << ICACHE_INDEX_BITS) | idx;
+
+            uint32_t ofs = addr & ICACHE_BLOCK_MASK;
+            *value = *(const uint32_t *) (blk->base + ofs);
+            return;
+        }
+    }
+
 #ifdef MMU_CACHE_STATS
     vm->cache_fetch[index].misses++;
 #endif
@@ -408,7 +427,16 @@ static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
     *value =
         vm->cache_fetch[index].page_addr[(addr >> 2) & MASK(RV_PAGE_SHIFT - 2)];
 
-    /* fill into the cache */
+    /* Move the current icache block into the victim cache before replacement */
+    if (blk->valid) {
+        victim_cache_block_t *vblk = &vm->icache.v_block[vm->icache.v_next];
+        *vblk = *blk;
+        vblk->tag = (blk->tag << ICACHE_INDEX_BITS) | idx;
+        vblk->valid = true;
+        vm->icache.v_next = (vm->icache.v_next + 1) % VCACHE_BLOCKS;
+    }
+
+    /* fill into the icache */
     uint32_t block_off = (addr & RV_PAGE_MASK) & ~ICACHE_BLOCK_MASK;
     blk->base = (const uint8_t *) vm->cache_fetch[index].page_addr + block_off;
     blk->tag = tag;
diff --git a/riscv.h b/riscv.h
index 0157f38..500a175 100644
--- a/riscv.h
+++ b/riscv.h
@@ -87,6 +87,23 @@ typedef struct __vm_internel vm_t;
 #define ICACHE_OFFSET_BITS 8
 #define ICACHE_INDEX_BITS 8
 
+/* VCACHE_BLOCK_SIZE: Size of one victim-cache block (line).
+ * VCACHE_BLOCKS: Number of blocks (lines) in the victim cache.
+ *
+ * The victim cache is implemented as a small, fully associative cache.
+ * It serves as a temporary buffer for instruction-cache blocks that
+ * were recently evicted from the instruction cache.
+ *
+ * Upon an instruction-cache miss, the victim cache is probed first.
+ * If the data is found there (a victim-cache hit), the instruction-cache
+ * block and the victim-cache block are swapped. Conversely, when the
+ * instruction cache is filled with new data, the old block evicted
+ * from the instruction cache is simultaneously placed into the victim
+ * cache.
+ */
+#define VCACHE_BLOCK_SIZE ICACHE_BLOCKS_SIZE
+#define VCACHE_BLOCKS 16
+
 /* For power-of-two sizes, (size - 1) sets all low bits to 1,
  * allowing fast extraction of an address field.
  */
@@ -100,8 +117,12 @@ typedef struct {
     bool valid;
 } icache_block_t;
 
+typedef icache_block_t victim_cache_block_t;
+
 typedef struct {
-    icache_block_t block[ICACHE_BLOCKS];
+    icache_block_t i_block[ICACHE_BLOCKS];
+    victim_cache_block_t v_block[VCACHE_BLOCKS];
+    uint32_t v_next;
 } icache_t;
 
 struct __hart_internal {

From bc84f9b5fe93fc0ae36d8907d52d191f93780488 Mon Sep 17 00:00:00 2001
From: Po-Ju Chen
Date: Wed, 12 Nov 2025 02:41:33 +0800
Subject: [PATCH 4/4] Add detailed MMU cache statistics for fetch path

Introduce fine-grained counters to replace the previous aggregated
fetch hit/miss statistics. The new metrics track:
- total fetches
- icache hits/misses
- victim cache hits/misses
- TLB hits/misses

The old statistics only distinguished total fetch hits and misses,
which no longer reflects real performance now that the icache and
victim cache layers exist. The finer breakdown enables more accurate
profiling and debugging: it gives a clearer view of instruction fetch
performance and helps identify which cache level contributes most to
stalls or misses.
---
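Note: the new percentage math divides by access_total,
fetch_misses_icache, and the combined TLB counts, all of which are zero
until the first fetch is recorded. A guarded helper in this spirit
avoids NaN output (pct() is illustrative, not part of the patch):

    #include <stdint.h>

    /* num/den as a percentage; yields 0.0 when the denominator is zero,
     * e.g. when MMU_CACHE_STATS is enabled but a hart never fetched. */
    static inline double pct(uint64_t num, uint64_t den)
    {
        return den ? 100.0 * (double) num / (double) den : 0.0;
    }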
 main.c  | 61 +++++++++++++++++++++++++++++++++++++++++++++------------
 riscv.c | 33 +++++++++++++++++++++++++------
 riscv.h |  9 +++++++--
 3 files changed, 83 insertions(+), 20 deletions(-)

diff --git a/main.c b/main.c
index 36957a9..15f23b1 100644
--- a/main.c
+++ b/main.c
@@ -1024,11 +1024,30 @@ static void print_mmu_cache_stats(vm_t *vm)
     fprintf(stderr, "\n=== MMU Cache Statistics ===\n");
     for (uint32_t i = 0; i < vm->n_hart; i++) {
         hart_t *hart = vm->hart[i];
-        uint64_t fetch_hits = 0, fetch_misses = 0;
-        fetch_hits = hart->cache_fetch[0].hits + hart->cache_fetch[1].hits;
-        fetch_misses =
-            hart->cache_fetch[0].misses + hart->cache_fetch[1].misses;
-        uint64_t fetch_total = fetch_hits + fetch_misses;
+
+        /* Combine 2-entry TLB statistics */
+        uint64_t fetch_hits_tlb = 0, fetch_misses_tlb = 0;
+        fetch_hits_tlb =
+            hart->cache_fetch[0].tlb_hits + hart->cache_fetch[1].tlb_hits;
+        fetch_misses_tlb =
+            hart->cache_fetch[0].tlb_misses + hart->cache_fetch[1].tlb_misses;
+
+        /* Combine icache statistics */
+        uint64_t fetch_hits_icache = 0, fetch_misses_icache = 0;
+        fetch_hits_icache =
+            hart->cache_fetch[0].icache_hits + hart->cache_fetch[1].icache_hits;
+        fetch_misses_icache = hart->cache_fetch[0].icache_misses +
+                              hart->cache_fetch[1].icache_misses;
+
+        /* Combine victim cache statistics */
+        uint64_t fetch_hits_vcache = 0, fetch_misses_vcache = 0;
+        fetch_hits_vcache =
+            hart->cache_fetch[0].vcache_hits + hart->cache_fetch[1].vcache_hits;
+        fetch_misses_vcache = hart->cache_fetch[0].vcache_misses +
+                              hart->cache_fetch[1].vcache_misses;
+
+        uint64_t access_total =
+            hart->cache_fetch[0].total_fetch + hart->cache_fetch[1].total_fetch;
 
         /* Combine 8-set × 2-way load cache statistics */
         uint64_t load_hits = 0, load_misses = 0;
@@ -1050,14 +1069,32 @@ static void print_mmu_cache_stats(vm_t *vm)
         }
         uint64_t store_total = store_hits + store_misses;
 
-        fprintf(stderr, "\nHart %u:\n", i);
-        fprintf(stderr, "  Fetch: %12llu hits, %12llu misses", fetch_hits,
-                fetch_misses);
-        if (fetch_total > 0)
-            fprintf(stderr, " (%.2f%% hit rate)",
-                    100.0 * fetch_hits / fetch_total);
-        fprintf(stderr, "\n");
+        fprintf(stderr, "\n=== Instruction Cache Statistics ===\n");
+        fprintf(stderr, "  Total fetches:   %12llu\n", access_total);
+        fprintf(stderr, "  Icache hits:     %12llu (%.2f%%)\n", fetch_hits_icache,
+                (fetch_hits_icache * 100.0) / access_total);
+        fprintf(stderr, "  Icache misses:   %12llu (%.2f%%)\n",
+                fetch_misses_icache,
+                (fetch_misses_icache * 100.0) / access_total);
+        fprintf(stderr,
+                "   ├ Vcache hits:   %8llu (%.2f%% of Icache misses, %.2f%% of total)\n",
+                fetch_hits_vcache,
+                (fetch_hits_vcache * 100.0) / fetch_misses_icache,
+                (fetch_hits_vcache * 100.0) / access_total);
+        fprintf(stderr,
+                "   └ Vcache misses: %8llu (%.2f%% of Icache misses, %.2f%% of total)\n",
+                fetch_misses_vcache,
+                (fetch_misses_vcache * 100.0) / fetch_misses_icache,
+                (fetch_misses_vcache * 100.0) / access_total);
+        fprintf(stderr, "   ├ TLB hits:      %4llu (%.2f%%)\n",
+                fetch_hits_tlb,
+                (fetch_hits_tlb * 100.0) / (fetch_hits_tlb + fetch_misses_tlb));
+        fprintf(
+            stderr, "   └ TLB misses:    %4llu (%.2f%%)\n", fetch_misses_tlb,
+            (fetch_misses_tlb * 100.0) / (fetch_hits_tlb + fetch_misses_tlb));
+
+        fprintf(stderr, "\n=== Data Cache Statistics ===\n");
 
         fprintf(stderr, "  Load:  %12llu hits, %12llu misses (8x2)", load_hits,
                 load_misses);
         if (load_total > 0)
diff --git a/riscv.c b/riscv.c
index a3ae952..dbae36c 100644
--- a/riscv.c
+++ b/riscv.c
@@ -371,29 +371,40 @@ static void mmu_fence(hart_t *vm, uint32_t insn UNUSED)
 
 static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
 {
-    /* icache lookup */
     uint32_t idx = (addr >> ICACHE_OFFSET_BITS) & ICACHE_INDEX_MASK;
     uint32_t tag = addr >> (ICACHE_OFFSET_BITS + ICACHE_INDEX_BITS);
     icache_block_t *blk = &vm->icache.i_block[idx];
     uint32_t vpn = addr >> RV_PAGE_SHIFT;
     uint32_t index = __builtin_parity(vpn) & 0x1;
 
+#ifdef MMU_CACHE_STATS
+    vm->cache_fetch[index].total_fetch++;
+#endif
+
+    /* icache lookup */
     if (likely(blk->valid && blk->tag == tag)) {
 #ifdef MMU_CACHE_STATS
-        vm->cache_fetch[index].hits++;
+        vm->cache_fetch[index].icache_hits++;
 #endif
         uint32_t ofs = addr & ICACHE_BLOCK_MASK;
         *value = *(const uint32_t *) (blk->base + ofs);
         return;
     }
 
-    /* search the victim cache */
+    /* icache miss: try the victim cache */
+#ifdef MMU_CACHE_STATS
+    vm->cache_fetch[index].icache_misses++;
+#endif
+
     uint32_t vcache_key = addr >> ICACHE_OFFSET_BITS;
     for (int i = 0; i < VCACHE_BLOCKS; i++) {
         victim_cache_block_t *vblk = &vm->icache.v_block[i];
 
-        /* victim cache hit: swap blocks */
         if (vblk->valid && vblk->tag == vcache_key) {
+            /* victim cache hit: swap blocks */
+#ifdef MMU_CACHE_STATS
+            vm->cache_fetch[index].vcache_hits++;
+#endif
            icache_block_t tmp = *blk;
             *blk = *vblk;
             *vblk = tmp;
@@ -407,11 +418,15 @@ static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
     }
 
 #ifdef MMU_CACHE_STATS
-    vm->cache_fetch[index].misses++;
+    vm->cache_fetch[index].vcache_misses++;
 #endif
 
-    /* icache miss: continue with the original VA->PA translation */
+    /* TLB lookup */
     if (unlikely(vpn != vm->cache_fetch[index].n_pages)) {
+        /* TLB miss: need to translate VA to PA */
+#ifdef MMU_CACHE_STATS
+        vm->cache_fetch[index].tlb_misses++;
+#endif
         mmu_translate(vm, &addr, (1 << 3), (1 << 6), false, RV_EXC_FETCH_FAULT,
                       RV_EXC_FETCH_PFAULT);
         if (vm->error)
@@ -423,6 +438,12 @@ static void mmu_fetch(hart_t *vm, uint32_t addr, uint32_t *value)
         vm->cache_fetch[index].n_pages = vpn;
         vm->cache_fetch[index].page_addr = page_addr;
     }
+    /* TLB hit */
+    else {
+#ifdef MMU_CACHE_STATS
+        vm->cache_fetch[index].tlb_hits++;
+#endif
+    }
 
     *value =
         vm->cache_fetch[index].page_addr[(addr >> 2) & MASK(RV_PAGE_SHIFT - 2)];
diff --git a/riscv.h b/riscv.h
index 500a175..6b39905 100644
--- a/riscv.h
+++ b/riscv.h
@@ -36,8 +36,13 @@ typedef struct {
     uint32_t n_pages;
     uint32_t *page_addr;
 #ifdef MMU_CACHE_STATS
-    uint64_t hits;
-    uint64_t misses;
+    uint64_t total_fetch;
+    uint64_t tlb_hits;
+    uint64_t tlb_misses;
+    uint64_t icache_hits;
+    uint64_t icache_misses;
+    uint64_t vcache_hits;
+    uint64_t vcache_misses;
 #endif
 } mmu_fetch_cache_t;