Skip to content

Commit bb5028e

Browse files
committed
Also make the cache arrays statically sized instead of dynamically sized,
and hoist the already-decoded check out of decode_into so callers can skip the call when it is not needed.
1 parent f47d88a commit bb5028e

2 files changed

Lines changed: 90 additions & 76 deletions

File tree

src/mips_cache_v2.rs

Lines changed: 83 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -433,22 +433,31 @@ impl<T> CacheVec<T> {
433433
/// `KIND` (a `CacheKind` discriminant cast to `u8`) controls whether the L2
434434
/// decoded-instruction array is allocated and which methods are meaningful.
435435
///
436-
/// - `tags`: one u32 per cache line (use `get_tag`/`set_tag` for typed access)
437-
/// - `data`: entire cache as `SIZE/8` u64 chunks
438-
/// - `instrs`: L2 only — one DecodedInstr per 4-byte word (`SIZE/4` entries); empty otherwise
439-
struct Cache<const SIZE: usize, const LINE: usize, const KIND: u8> {
440-
tags: CacheVec<u32>,
441-
data: CacheVec<u64>,
442-
/// L2 decoded-instruction slots (SIZE/4 entries). Empty for L1-I and L1-D.
436+
/// - `tags`: `TAGS` u32 tags inline in the struct (no heap indirection; TAGS = SIZE/LINE)
437+
/// - `data`: `DATA` u64 chunks inline in the struct (no heap indirection; DATA = SIZE/8)
438+
/// - `instrs`: L2 only — heap Vec of SIZE/4 DecodedInstr slots (6MB, contains fn ptrs)
439+
///
440+
/// `TAGS` and `DATA` are redundant with `SIZE`/`LINE` but required as explicit const generics
441+
/// because stable Rust cannot use arithmetic on generic params in array length positions.
442+
struct Cache<const SIZE: usize, const LINE: usize, const KIND: u8,
443+
const TAGS: usize, const DATA: usize> {
444+
/// Heap-allocated tag array — one u32 per cache line. Use `get_tag`/`set_tag` for typed access.
445+
tags: UnsafeCell<Box<[u32; TAGS]>>,
446+
/// Heap-allocated data array — entire cache contents as u64 chunks.
447+
data: UnsafeCell<Box<[u64; DATA]>>,
448+
/// L2 decoded-instruction slots (SIZE/4 entries). Empty Vec for L1-I and L1-D.
443449
instrs: CacheVec<DecodedInstr>,
444450
/// Signals the decode thread to stop (kept for Drop compatibility).
445451
stop: Arc<AtomicBool>,
446452
}
447453

448-
unsafe impl<const SIZE: usize, const LINE: usize, const KIND: u8> Send for Cache<SIZE, LINE, KIND> {}
449-
unsafe impl<const SIZE: usize, const LINE: usize, const KIND: u8> Sync for Cache<SIZE, LINE, KIND> {}
454+
unsafe impl<const SIZE: usize, const LINE: usize, const KIND: u8,
455+
const TAGS: usize, const DATA: usize> Send for Cache<SIZE, LINE, KIND, TAGS, DATA> {}
456+
unsafe impl<const SIZE: usize, const LINE: usize, const KIND: u8,
457+
const TAGS: usize, const DATA: usize> Sync for Cache<SIZE, LINE, KIND, TAGS, DATA> {}
450458

451-
impl<const SIZE: usize, const LINE: usize, const KIND: u8> Cache<SIZE, LINE, KIND> {
459+
impl<const SIZE: usize, const LINE: usize, const KIND: u8,
460+
const TAGS: usize, const DATA: usize> Cache<SIZE, LINE, KIND, TAGS, DATA> {
452461
// ---- Compile-time geometry constants ----
453462
const NUM_LINES: usize = SIZE / LINE;
454463
const LINE_SHIFT: u32 = ctz(LINE);
@@ -471,8 +480,10 @@ impl<const SIZE: usize, const LINE: usize, const KIND: u8> Cache<SIZE, LINE, KIN
471480
Vec::new()
472481
};
473482
Self {
474-
tags: CacheVec::new(vec![0u32; Self::NUM_LINES]),
475-
data: CacheVec::new(vec![0u64; SIZE / 8]),
483+
// SAFETY: u32/u64 are valid at all-zero bit patterns. Box::new_zeroed avoids
484+
// constructing the array on the stack before moving to the heap.
485+
tags: UnsafeCell::new(unsafe { Box::new_zeroed().assume_init() }),
486+
data: UnsafeCell::new(unsafe { Box::new_zeroed().assume_init() }),
476487
instrs: CacheVec::new(instrs),
477488
stop: Arc::new(AtomicBool::new(false)),
478489
}
@@ -498,41 +509,50 @@ impl<const SIZE: usize, const LINE: usize, const KIND: u8> Cache<SIZE, LINE, KIN
498509
(line_idx << Self::CHUNKS_PER_LINE_SHIFT) + chunk_offset
499510
}
500511

512+
#[inline(always)]
513+
fn tags(&self) -> &[u32; TAGS] { unsafe { &**self.tags.get() } }
514+
#[inline(always)]
515+
fn tags_mut(&self) -> &mut [u32; TAGS] { unsafe { &mut **self.tags.get() } }
516+
#[inline(always)]
517+
fn data(&self) -> &[u64; DATA] { unsafe { &**self.data.get() } }
518+
#[inline(always)]
519+
fn data_mut(&self) -> &mut [u64; DATA] { unsafe { &mut **self.data.get() } }
520+
501521
/// Read the tag at `idx` as a typed bitfield struct.
502522
#[inline(always)]
503523
fn get_tag<T: From<u32>>(&self, idx: usize) -> T {
504-
T::from(self.tags.get()[idx])
524+
T::from(unsafe { *self.tags().get_unchecked(idx) })
505525
}
506526

507527
/// Write a typed bitfield tag to `idx`.
508528
#[inline(always)]
509529
fn set_tag<T: Into<u32>>(&self, idx: usize, tag: T) {
510-
self.tags.get_mut()[idx] = tag.into();
530+
unsafe { *self.tags_mut().get_unchecked_mut(idx) = tag.into(); }
511531
}
512532

513533
/// View cache data as a flat &[u32] (two per u64, big-endian word order).
514534
/// XOR word index with 1 to address naturally on a little-endian host.
515535
/// Used by the I-cache to store l2.instrs slot indices.
516536
#[inline(always)]
517537
fn data_as_words(&self) -> &[u32] {
518-
let slice = self.data.get();
519-
unsafe { std::slice::from_raw_parts(slice.as_ptr() as *const u32, slice.len() * 2) }
538+
let arr = self.data();
539+
unsafe { std::slice::from_raw_parts(arr.as_ptr() as *const u32, SIZE / 4) }
520540
}
521541

522542
/// View cache data as a flat &[u16] (big-endian halfword order within each u64).
523543
/// XOR halfword index with 3 to convert MIPS big-endian address to host offset.
524544
#[inline(always)]
525545
fn data_as_halves(&self) -> &[u16] {
526-
let slice = self.data.get();
527-
unsafe { std::slice::from_raw_parts(slice.as_ptr() as *const u16, slice.len() * 4) }
546+
let arr = self.data();
547+
unsafe { std::slice::from_raw_parts(arr.as_ptr() as *const u16, SIZE / 2) }
528548
}
529549

530550
/// View cache data as a flat &[u8] (big-endian byte order within each u64).
531551
/// XOR byte index with 7 to convert MIPS big-endian address to host offset.
532552
#[inline(always)]
533553
fn data_as_bytes(&self) -> &[u8] {
534-
let slice = self.data.get();
535-
unsafe { std::slice::from_raw_parts(slice.as_ptr() as *const u8, slice.len() * 8) }
554+
let arr = self.data();
555+
unsafe { std::slice::from_raw_parts(arr.as_ptr() as *const u8, SIZE) }
536556
}
537557
}
538558

@@ -555,13 +575,13 @@ pub struct R4000Cache {
555575
downstream: Arc<dyn BusDevice>,
556576

557577
// L1 Instruction Cache (16 KB, 16-byte lines)
558-
ic: Cache<IC_SIZE, IC_LINE, { CacheKind::Insn as u8 }>,
578+
ic: ICache,
559579

560580
// L1 Data Cache (16 KB, 16-byte lines)
561-
dc: Cache<DC_SIZE, DC_LINE, { CacheKind::Data as u8 }>,
581+
dc: DCache,
562582

563583
// L2 Unified Cache (1 MB, 128-byte lines)
564-
l2: Cache<L2_SIZE, L2_LINE, { CacheKind::L2 as u8 }>,
584+
l2: L2Cache,
565585

566586
// Load-Linked / Store-Conditional support
567587
llbit: UnsafeCell<bool>,
@@ -593,9 +613,10 @@ unsafe impl Send for R4000Cache {}
593613
unsafe impl Sync for R4000Cache {}
594614

595615
// Type aliases for the concrete cache instances, for brevity in R4000Cache impls.
596-
type ICache = Cache<IC_SIZE, IC_LINE, { CacheKind::Insn as u8 }>;
597-
type DCache = Cache<DC_SIZE, DC_LINE, { CacheKind::Data as u8 }>;
598-
type L2Cache = Cache<L2_SIZE, L2_LINE, { CacheKind::L2 as u8 }>;
616+
// TAGS = SIZE/LINE (one tag per cache line), DATA = SIZE/8 (one u64 per 8 bytes).
617+
type ICache = Cache<IC_SIZE, IC_LINE, { CacheKind::Insn as u8 }, { IC_SIZE / IC_LINE }, { IC_SIZE / 8 }>;
618+
type DCache = Cache<DC_SIZE, DC_LINE, { CacheKind::Data as u8 }, { DC_SIZE / DC_LINE }, { DC_SIZE / 8 }>;
619+
type L2Cache = Cache<L2_SIZE, L2_LINE, { CacheKind::L2 as u8 }, { L2_SIZE / L2_LINE }, { L2_SIZE / 8 }>;
599620

600621
impl R4000Cache {
601622
pub fn new(downstream: Arc<dyn BusDevice>) -> Self {
@@ -778,16 +799,16 @@ impl R4000Cache {
778799
/// L2 line. The caller iterates over `l1_lines_per_l2` indices starting here,
779800
/// stepping by 1 (indices wrap naturally via the cache mask).
780801
#[inline]
781-
fn l2_idx_to_l1_base_idx<const L1_SIZE: usize, const L1_LINE: usize, const L1_KIND: u8>(
782-
&self, l2_idx: usize, pidx: u32, _l1: &Cache<L1_SIZE, L1_LINE, L1_KIND>
802+
fn l2_idx_to_l1_base_idx<const L1_SIZE: usize, const L1_LINE: usize, const L1_KIND: u8, const L1_TAGS: usize, const L1_DATA: usize>(
803+
&self, l2_idx: usize, pidx: u32, _l1: &Cache<L1_SIZE, L1_LINE, L1_KIND, L1_TAGS, L1_DATA>
783804
) -> usize {
784805
// Physical bits of the L2 line start address that are below bit 12 (page boundary)
785806
// These bits are the same in VA and PA, so we can derive them from the L2 index.
786807
let phys_sub_bits = (l2_idx << L2Cache::LINE_SHIFT as usize) & 0xFFF;
787808
// Reconstruct the virtual address bits used for L1 indexing
788809
let virt_index_bits = ((pidx as usize) << L2_PIDX_VADDR_SHIFT as usize) | phys_sub_bits;
789-
(virt_index_bits >> Cache::<L1_SIZE, L1_LINE, L1_KIND>::LINE_SHIFT as usize)
790-
& Cache::<L1_SIZE, L1_LINE, L1_KIND>::NUM_LINES_MASK
810+
(virt_index_bits >> Cache::<L1_SIZE, L1_LINE, L1_KIND, L1_TAGS, L1_DATA>::LINE_SHIFT as usize)
811+
& Cache::<L1_SIZE, L1_LINE, L1_KIND, L1_TAGS, L1_DATA>::NUM_LINES_MASK
791812
}
792813

793814
/// Check if the given physical address overlaps with the Load Linked address.
@@ -945,8 +966,8 @@ impl R4000Cache {
945966
}
946967

947968
// Write data from L1-D to L2
948-
let dc_data = self.dc.data.get();
949-
let l2_data = self.l2.data.get_mut();
969+
let dc_data = self.dc.data();
970+
let l2_data = self.l2.data_mut();
950971

951972
let l1_start_chunk = l1_idx << DCache::CHUNKS_PER_LINE_SHIFT;
952973

@@ -1027,7 +1048,7 @@ impl R4000Cache {
10271048
println!("[CACHE DEBUG] writeback_l2_line: {} idx={}, phys_addr=0x{:08x}, ptag=0x{:05x}, cs={}, WRITING TO MEMORY",
10281049
self.tracking_label_l2_idx(idx), idx, phys_addr, tag.ptag(), cs);
10291050
// Dump the L2 line data being written
1030-
let l2_data = self.l2.data.get();
1051+
let l2_data = self.l2.data();
10311052
let start_chunk = idx << L2Cache::CHUNKS_PER_LINE_SHIFT;
10321053
println!(" L2 line data being written (16 x u64):");
10331054
for i in 0..L2Cache::CHUNKS_PER_LINE {
@@ -1040,7 +1061,7 @@ impl R4000Cache {
10401061
// An L2 writeback/eviction is not a coherency action and must not break LL/SC.
10411062

10421063
// Now write L2 data to memory
1043-
let l2_data = self.l2.data.get();
1064+
let l2_data = self.l2.data();
10441065
let start_chunk = idx << L2Cache::CHUNKS_PER_LINE_SHIFT;
10451066

10461067
for i in 0..L2Cache::CHUNKS_PER_LINE {
@@ -1074,7 +1095,7 @@ impl R4000Cache {
10741095
let line_base = phys_addr & !(L2Cache::LINE_MASK as u64);
10751096

10761097
// Fill line from memory
1077-
let l2_data = self.l2.data.get_mut();
1098+
let l2_data = self.l2.data_mut();
10781099
let start_chunk = l2_idx << L2Cache::CHUNKS_PER_LINE_SHIFT;
10791100

10801101
let instrs_start = l2_idx << L2Cache::INSTR_SHIFT;
@@ -1163,7 +1184,7 @@ impl R4000Cache {
11631184
let ic_line_base = phys_addr & !(ICache::LINE_MASK as u64);
11641185
let l2_word_offset = ((ic_line_base as usize) & L2Cache::LINE_MASK) >> 2;
11651186
let l2_instrs_base = (l2_idx << L2Cache::INSTR_SHIFT) + l2_word_offset;
1166-
let ic_data = self.ic.data.get_mut();
1187+
let ic_data = self.ic.data_mut();
11671188
let ic_data_base = ic_idx * ICache::CHUNKS_PER_LINE;
11681189
for i in 0..ICache::CHUNKS_PER_LINE {
11691190
let idx0 = (l2_instrs_base + i * 2 ) as u32;
@@ -1219,8 +1240,8 @@ impl R4000Cache {
12191240
let l2_line_base = l2_idx << L2Cache::CHUNKS_PER_LINE_SHIFT;
12201241
let offset_in_l2_line = ((dc_line_base & (L2Cache::LINE_MASK as u64)) >> 3) as usize;
12211242

1222-
let l2_data = self.l2.data.get();
1223-
let dc_data = self.dc.data.get_mut();
1243+
let l2_data = self.l2.data();
1244+
let dc_data = self.dc.data_mut();
12241245
let dc_start_chunk = dc_idx << DCache::CHUNKS_PER_LINE_SHIFT;
12251246

12261247
for i in 0..DCache::CHUNKS_PER_LINE {
@@ -1333,7 +1354,7 @@ impl MipsCache for R4000Cache {
13331354
1 => self.dc.data_as_bytes()[data_idx * 8 + ((phys_addr as usize & 7) ^ 7)] as u64,
13341355
2 => self.dc.data_as_halves()[data_idx * 4 + ((phys_addr as usize & 7) >> 1 ^ 3)] as u64,
13351356
4 => self.dc.data_as_words()[data_idx * 2 + ((phys_addr as usize & 7) >> 2 ^ 1)] as u64,
1336-
8 => self.dc.data.get()[data_idx],
1357+
8 => self.dc.data()[data_idx],
13371358
_ => return BusRead64::err(),
13381359
};
13391360
BusRead64::ok(data)
@@ -1369,7 +1390,7 @@ impl MipsCache for R4000Cache {
13691390

13701391
// Write to L1-D cache
13711392
let data_idx = self.dc.get_data_index(virt_addr);
1372-
let dc_data = self.dc.data.get_mut();
1393+
let dc_data = self.dc.data_mut();
13731394
let current = dc_data[data_idx];
13741395
dc_data[data_idx] = (current & !mask) | (val & mask);
13751396

@@ -1763,7 +1784,7 @@ impl MipsCache for R4000Cache {
17631784
_ => "Unknown",
17641785
};
17651786

1766-
let dc_data = self.dc.data.get();
1787+
let dc_data = self.dc.data();
17671788
let start = idx << DCache::CHUNKS_PER_LINE_SHIFT;
17681789

17691790
let mut s = format!("L1-D Line 0x{:x}: Tag=0x{:06x} CS={} ({}) D={}\n Data:",
@@ -1790,7 +1811,7 @@ impl MipsCache for R4000Cache {
17901811
_ => "Reserved",
17911812
};
17921813

1793-
let l2_data = self.l2.data.get();
1814+
let l2_data = self.l2.data();
17941815
let start = idx << L2Cache::CHUNKS_PER_LINE_SHIFT;
17951816

17961817
let mut s = format!("L2 Line 0x{:x}: Tag=0x{:05x} CS={} ({})\n Data:",
@@ -1808,12 +1829,12 @@ impl MipsCache for R4000Cache {
18081829
}
18091830

18101831
fn power_on(&self) {
1811-
self.ic.tags.get_mut().fill(0);
1812-
self.ic.data.get_mut().fill(0);
1813-
self.dc.tags.get_mut().fill(0);
1814-
self.dc.data.get_mut().fill(0);
1815-
self.l2.tags.get_mut().fill(0);
1816-
self.l2.data.get_mut().fill(0);
1832+
self.ic.tags_mut().fill(0);
1833+
self.ic.data_mut().fill(0);
1834+
self.dc.tags_mut().fill(0);
1835+
self.dc.data_mut().fill(0);
1836+
self.l2.tags_mut().fill(0);
1837+
self.l2.data_mut().fill(0);
18171838
for s in self.l2.instrs.get_mut().iter_mut() {
18181839
s.decoded = false;
18191840
s.raw = 0;
@@ -1845,12 +1866,12 @@ impl Drop for R4000Cache {
18451866

18461867
impl Resettable for R4000Cache {
18471868
fn power_on(&self) {
1848-
self.ic.tags.get_mut().fill(0);
1849-
self.ic.data.get_mut().fill(0);
1850-
self.dc.tags.get_mut().fill(0);
1851-
self.dc.data.get_mut().fill(0);
1852-
self.l2.tags.get_mut().fill(0);
1853-
self.l2.data.get_mut().fill(0);
1869+
self.ic.tags_mut().fill(0);
1870+
self.ic.data_mut().fill(0);
1871+
self.dc.tags_mut().fill(0);
1872+
self.dc.data_mut().fill(0);
1873+
self.l2.tags_mut().fill(0);
1874+
self.l2.data_mut().fill(0);
18541875
for s in self.l2.instrs.get_mut().iter_mut() {
18551876
s.decoded = false;
18561877
s.raw = 0;
@@ -1865,17 +1886,15 @@ impl Resettable for R4000Cache {
18651886
// ---- snapshot helpers + MipsCache save/load override ----
18661887

18671888
impl R4000Cache {
1868-
fn save_cache_inner<const S: usize, const L: usize, const K: u8>(c: &Cache<S, L, K>) -> (Vec<u32>, Vec<u64>) {
1869-
(c.tags.get().clone(), c.data.get().clone())
1889+
fn save_cache_inner<const S: usize, const L: usize, const K: u8, const TG: usize, const DA: usize>(c: &Cache<S, L, K, TG, DA>) -> (Vec<u32>, Vec<u64>) {
1890+
(c.tags().to_vec(), c.data().to_vec())
18701891
}
18711892

1872-
fn load_cache_inner<const S: usize, const L: usize, const K: u8>(c: &Cache<S, L, K>, tags: &[u32], data: &[u64]) {
1873-
let t = c.tags.get_mut();
1874-
let tl = tags.len().min(t.len());
1875-
t[..tl].copy_from_slice(&tags[..tl]);
1876-
let d = c.data.get_mut();
1877-
let dl = data.len().min(d.len());
1878-
d[..dl].copy_from_slice(&data[..dl]);
1893+
fn load_cache_inner<const S: usize, const L: usize, const K: u8, const TG: usize, const DA: usize>(c: &Cache<S, L, K, TG, DA>, tags: &[u32], data: &[u64]) {
1894+
let tl = tags.len().min(TG);
1895+
c.tags_mut()[..tl].copy_from_slice(&tags[..tl]);
1896+
let dl = data.len().min(DA);
1897+
c.data_mut()[..dl].copy_from_slice(&data[..dl]);
18791898
}
18801899

18811900
pub fn save_cache_state(&self) -> toml::Value {
@@ -1919,7 +1938,7 @@ impl R4000Cache {
19191938

19201939
// Rebuild l2.instrs from restored l2.data
19211940
{
1922-
let l2_data_slice: Vec<u64> = self.l2.data.get().clone();
1941+
let l2_data_slice = self.l2.data();
19231942
let l2_instrs = self.l2.instrs.get_mut();
19241943
for line in 0..L2Cache::NUM_LINES {
19251944
let chunks_start = line << L2Cache::CHUNKS_PER_LINE_SHIFT;

src/mips_exec.rs

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -827,10 +827,7 @@ For R4000SC/MC CPUs:
827827
pub fn exec(&mut self, instr: u32) -> ExecStatus {
828828
self.ins.raw = instr;
829829
self.ins.decoded = false;
830-
if decode_into::<T, C>(&mut self.ins) {
831-
#[cfg(feature = "developer")]
832-
self.decoded_count.fetch_add(1, Ordering::Relaxed);
833-
}
830+
decode_into::<T, C>(&mut self.ins);
834831
let d: *const DecodedInstr = &self.ins;
835832
self.exec_decoded(unsafe { &*d })
836833
}
@@ -914,7 +911,10 @@ For R4000SC/MC CPUs:
914911
let fetch = self.fetch_instr(pc);
915912
let result = if fetch.status == EXEC_COMPLETE {
916913
let slot = fetch.instr as *mut DecodedInstr;
917-
if decode_into::<T, C>(unsafe { &mut *slot }) {
914+
let d = unsafe { &mut *slot };
915+
if !d.decoded {
916+
decode_into::<T, C>(d);
917+
} else {
918918
#[cfg(feature = "developer")]
919919
self.decoded_count.fetch_add(1, Ordering::Relaxed);
920920
}
@@ -4204,12 +4204,8 @@ For R4000SC/MC CPUs:
42044204

42054205
}
42064206

4207-
/// Decode `raw` into `ins` if not already decoded.
4208-
/// Returns `true` if the instruction was already decoded, `false` otherwise.
4209-
pub fn decode_into<T: Tlb, C: MipsCache>(ins: &mut DecodedInstr) -> bool {
4210-
if ins.decoded {
4211-
return true;
4212-
}
4207+
/// Decode `raw` into `ins`. Caller is responsible for checking `ins.decoded` first.
4208+
pub fn decode_into<T: Tlb, C: MipsCache>(ins: &mut DecodedInstr) {
42134209
let raw = ins.raw;
42144210

42154211
let op = ((raw >> 26) & 0x3F) as u8;
@@ -4450,7 +4446,6 @@ pub fn decode_into<T: Tlb, C: MipsCache>(ins: &mut DecodedInstr) -> bool {
44504446
ins.funct = funct;
44514447
ins.handler = handler as usize;
44524448
ins.decoded = true;
4453-
false
44544449
}
44554450

44564451
// Field extraction helpers have been replaced by DecodedInstr fields

0 commit comments

Comments
 (0)