diff --git a/crates/vm/src/builtins/dict.rs b/crates/vm/src/builtins/dict.rs
index 0cc9ee66f3f..fa8f45e0505 100644
--- a/crates/vm/src/builtins/dict.rs
+++ b/crates/vm/src/builtins/dict.rs
@@ -5,7 +5,7 @@ use super::{
 use crate::common::lock::LazyLock;
 use crate::object::{Traverse, TraverseFn};
 use crate::{
-    AsObject, Context, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, PyResult,
+    AsObject, Context, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, PyResult,
     TryFromObject, atomic_func,
     builtins::{
         PyTuple,
@@ -681,7 +681,10 @@ impl Py<PyDict> {
         let self_exact = self.exact_dict(vm);
         let other_exact = other.exact_dict(vm);
         if self_exact && other_exact {
-            self.entries.get_chain(&other.entries, vm, key)
+            // SAFETY: exact_dict checks passed
+            let self_exact = unsafe { PyExact::ref_unchecked(self) };
+            let other_exact = unsafe { PyExact::ref_unchecked(other) };
+            self_exact.get_chain_exact(other_exact, key, vm)
         } else if let Some(value) = self.get_item_opt(key, vm)? {
             Ok(Some(value))
         } else {
@@ -690,6 +693,21 @@ impl Py<PyDict> {
     }
 }

+impl PyExact<PyDict> {
+    /// Look up `key` in `self`, falling back to `other`.
+    /// Both dicts must be exact `dict` types (enforced by `PyExact`).
+    pub(crate) fn get_chain_exact<K: DictKey + ?Sized>(
+        &self,
+        other: &Self,
+        key: &K,
+        vm: &VirtualMachine,
+    ) -> PyResult<Option<PyObjectRef>> {
+        debug_assert!(self.class().is(vm.ctx.types.dict_type));
+        debug_assert!(other.class().is(vm.ctx.types.dict_type));
+        self.entries.get_chain(&other.entries, vm, key)
+    }
+}
+
 // Implement IntoIterator so that we can easily iterate dictionaries from rust code.
 impl IntoIterator for PyDictRef {
     type Item = (PyObjectRef, PyObjectRef);
diff --git a/crates/vm/src/builtins/range.rs b/crates/vm/src/builtins/range.rs
index e42cf5f23f6..4b563de81f4 100644
--- a/crates/vm/src/builtins/range.rs
+++ b/crates/vm/src/builtins/range.rs
@@ -613,6 +613,19 @@ pub struct PyRangeIterator {
     length: usize,
 }

+impl PyRangeIterator {
+    /// Advance and return the next value without going through the iterator protocol.
+    #[inline]
+    pub(crate) fn next_fast(&self) -> Option<BigInt> {
+        let index = self.index.fetch_add(1, Ordering::Relaxed);
+        if index < self.length {
+            Some(&self.start + (index as isize) * &self.step)
+        } else {
+            None
+        }
+    }
+}
+
 impl PyPayload for PyRangeIterator {
     #[inline]
     fn class(ctx: &Context) -> &'static Py<PyType> {
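
Note: `next_fast` needs no lock because the atomic index is the only mutable
state; `fetch_add` hands every caller a distinct slot. A minimal standalone
sketch of the same pattern, using std types only (`RangeIter` and its `i64`
fields are illustrative stand-ins for `PyRangeIterator` and its `BigInt`
fields, not the VM's types):

    use std::sync::atomic::{AtomicUsize, Ordering};

    struct RangeIter {
        index: AtomicUsize, // the only field that ever changes
        start: i64,
        step: i64,
        length: usize,
    }

    impl RangeIter {
        fn next_fast(&self) -> Option<i64> {
            // fetch_add both reads and claims the slot; Relaxed suffices
            // because the other fields are immutable after construction.
            let index = self.index.fetch_add(1, Ordering::Relaxed);
            if index < self.length {
                Some(self.start + index as i64 * self.step)
            } else {
                None
            }
        }
    }

    fn main() {
        let it = RangeIter { index: AtomicUsize::new(0), start: 2, step: 3, length: 4 };
        while let Some(v) = it.next_fast() {
            print!("{v} "); // 2 5 8 11
        }
    }
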
diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs
index 576815e3b4f..a9e8484dfcc 100644
--- a/crates/vm/src/builtins/str.rs
+++ b/crates/vm/src/builtins/str.rs
@@ -1584,6 +1584,20 @@ impl AsMapping for PyStr {
 impl AsNumber for PyStr {
     fn as_number() -> &'static PyNumberMethods {
         static AS_NUMBER: PyNumberMethods = PyNumberMethods {
+            add: Some(|a, b, vm| {
+                let Some(a) = a.downcast_ref::<PyStr>() else {
+                    return Ok(vm.ctx.not_implemented());
+                };
+                let Some(b) = b.downcast_ref::<PyStr>() else {
+                    return Ok(vm.ctx.not_implemented());
+                };
+                let bytes = a.as_wtf8().py_add(b.as_wtf8());
+                Ok(unsafe {
+                    let kind = a.kind() | b.kind();
+                    PyStr::new_str_unchecked(bytes.into(), kind)
+                }
+                .to_pyobject(vm))
+            }),
             remainder: Some(|a, b, vm| {
                 if let Some(a) = a.downcast_ref::<PyStr>() {
                     a.__mod__(b.to_owned(), vm).to_pyresult(vm)
diff --git a/crates/vm/src/frame.rs b/crates/vm/src/frame.rs
index f1175401676..663885c579d 100644
--- a/crates/vm/src/frame.rs
+++ b/crates/vm/src/frame.rs
@@ -1,15 +1,18 @@
 #[cfg(feature = "flame")]
 use crate::bytecode::InstructionMetadata;
 use crate::{
-    AsObject, Py, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, PyStackRef, TryFromObject,
-    VirtualMachine,
+    AsObject, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyResult, PyStackRef,
+    TryFromObject, VirtualMachine,
     builtins::{
         PyBaseException, PyBaseExceptionRef, PyCode, PyCoroutine, PyDict, PyDictRef, PyGenerator,
         PyInterpolation, PyList, PySet, PySlice, PyStr, PyStrInterned, PyTemplate, PyTraceback,
         PyType, PyUtf8Str,
         asyncgenerator::PyAsyncGenWrappedValue,
+        float::PyFloat,
         frame::stack_analysis,
         function::{PyCell, PyCellRef, PyFunction},
+        int::PyInt,
+        range::PyRangeIterator,
         tuple::{PyTuple, PyTupleRef},
     },
     bytecode::{self, Instruction, LoadAttr, LoadSuperAttr, SpecialMethod},
@@ -22,7 +25,7 @@ use crate::{
     protocol::{PyIter, PyIterReturn},
     scope::Scope,
     stdlib::{builtins, sys::monitoring, typing},
-    types::PyTypeFlags,
+    types::{PyComparisonOp, PyTypeFlags},
     vm::{Context, PyMethod},
 };
 use alloc::fmt;
@@ -34,6 +37,7 @@ use core::sync::atomic::AtomicPtr;
 use core::sync::atomic::Ordering::Relaxed;
 use indexmap::IndexMap;
 use itertools::Itertools;
+use malachite_bigint::BigInt;
 use rustpython_common::atomic::{PyAtomic, Radium};
 use rustpython_common::{
     boxvec::BoxVec,
@@ -97,9 +101,11 @@ impl FrameOwner {
 /// Lock-free storage for local variables (localsplus).
 ///
 /// # Safety
-/// Access is serialized by the frame's state mutex in `with_exec()`, which
-/// prevents concurrent frame execution. Trace callbacks that access `f_locals`
-/// run sequentially on the same thread as instruction execution.
+/// Mutable access is serialized by the frame's state mutex in `with_exec()`.
+/// External readers (e.g. `f_locals`) must use `try_lock` on the state mutex:
+/// if acquired, the frame is not executing and access is exclusive; if not,
+/// the caller is on the same thread as `with_exec()` (trace callback) and
+/// access is safe because frame execution is single-threaded.
 pub struct FastLocals {
     inner: UnsafeCell<Box<[Option<PyObjectRef>]>>,
 }
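
Note: the subtle part of the safety scheme above is that a failed `try_lock`
is not an error but evidence that the caller sits on the executing thread
itself. A minimal sketch of the reader protocol with `std::sync::Mutex`
(`Frame`, the unit state, and the `i32` payload are placeholders for the VM's
types, not its API):

    use std::sync::Mutex;

    struct Frame {
        state: Mutex<()>,             // stands in for the FrameState mutex
        fastlocals: Vec<Option<i32>>, // stands in for the UnsafeCell storage
    }

    impl Frame {
        fn snapshot_locals(&self) -> Vec<Option<i32>> {
            // Ok(guard): the frame is idle and we hold exclusive access
            // until scope end. Err(WouldBlock): we are already inside
            // with_exec() on this same thread (e.g. a trace callback),
            // so reading without the lock is still race-free.
            let _guard = self.state.try_lock();
            self.fastlocals.clone()
        }
    }

    fn main() {
        let f = Frame { state: Mutex::new(()), fastlocals: vec![Some(1), None] };
        println!("{:?}", f.snapshot_locals());
    }
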
@@ -387,12 +393,17 @@ impl Frame {
     }

     pub fn locals(&self, vm: &VirtualMachine) -> PyResult<ArgMapping> {
+        // Acquire the state mutex to synchronize with frame execution.
+        // If try_lock fails, the frame is executing on this thread (e.g.
+        // trace callback accessing f_locals), so fastlocals access is safe.
+        let _guard = self.state.try_lock();
         let locals = &self.locals;
         let code = &**self.code;
         let map = &code.varnames;
         let j = core::cmp::min(map.len(), code.varnames.len());
         if !code.varnames.is_empty() {
-            // SAFETY: Trace callbacks run sequentially on the same thread.
+            // SAFETY: Either _guard holds the state mutex (frame not executing),
+            // or we're in a trace callback on the same thread that holds it.
             let fastlocals = unsafe { self.fastlocals.borrow() };
             for (&k, v) in zip(&map[..j], fastlocals) {
                 match locals.mapping().ass_subscript(k, v.clone(), vm) {
@@ -403,8 +414,6 @@ impl Frame {
             }
         }
         if !code.cellvars.is_empty() || !code.freevars.is_empty() {
-            // Access cells through fastlocals to avoid locking state
-            // (state may be held by with_exec during frame execution).
             for (i, &k) in code.cellvars.iter().enumerate() {
                 let cell_value = self.get_cell_contents(i);
                 match locals.mapping().ass_subscript(k, cell_value, vm) {
@@ -430,7 +439,7 @@ impl Frame {

 impl Py<Frame> {
     #[inline(always)]
-    fn with_exec<R>(&self, f: impl FnOnce(ExecutingFrame<'_>) -> R) -> R {
+    fn with_exec<R>(&self, vm: &VirtualMachine, f: impl FnOnce(ExecutingFrame<'_>) -> R) -> R {
         let mut state = self.state.lock();
         let exec = ExecutingFrame {
             code: &self.code,
@@ -438,6 +447,14 @@ impl Py<Frame> {
             locals: &self.locals,
             globals: &self.globals,
             builtins: &self.builtins,
+            builtins_dict: if self.globals.class().is(vm.ctx.types.dict_type) {
+                self.builtins
+                    .downcast_ref_if_exact::<PyDict>(vm)
+                    // SAFETY: downcast_ref_if_exact already verified exact type
+                    .map(|d| unsafe { PyExact::ref_unchecked(d) })
+            } else {
+                None
+            },
             lasti: &self.lasti,
             object: self,
             state: &mut state,
@@ -448,7 +465,7 @@ impl Py<Frame> {

     // #[cfg_attr(feature = "flame-it", flame("Frame"))]
     pub fn run(&self, vm: &VirtualMachine) -> PyResult<ExecutionResult> {
-        self.with_exec(|mut exec| exec.run(vm))
+        self.with_exec(vm, |mut exec| exec.run(vm))
     }

     pub(crate) fn resume(
@@ -456,7 +473,7 @@ impl Py<Frame> {
         value: Option<PyObjectRef>,
         vm: &VirtualMachine,
     ) -> PyResult<ExecutionResult> {
-        self.with_exec(|mut exec| {
+        self.with_exec(vm, |mut exec| {
             if let Some(value) = value {
                 exec.push_value(value)
             }
@@ -471,7 +488,7 @@ impl Py<Frame> {
         exc_val: PyObjectRef,
         exc_tb: PyObjectRef,
     ) -> PyResult<ExecutionResult> {
-        self.with_exec(|mut exec| exec.gen_throw(vm, exc_type, exc_val, exc_tb))
+        self.with_exec(vm, |mut exec| exec.gen_throw(vm, exc_type, exc_val, exc_tb))
     }

     pub fn yield_from_target(&self) -> Option<PyObjectRef> {
@@ -484,6 +501,7 @@ impl Py<Frame> {
             locals: &self.locals,
             globals: &self.globals,
             builtins: &self.builtins,
+            builtins_dict: None,
             lasti: &self.lasti,
             object: self,
             state: &mut state,
@@ -519,6 +537,11 @@ struct ExecutingFrame<'a> {
     locals: &'a ArgMapping,
     globals: &'a PyDictRef,
     builtins: &'a PyObjectRef,
+    /// Cached downcast of builtins to PyDict for fast LOAD_GLOBAL.
+    /// Only set when both globals and builtins are exact dict types (not
+    /// subclasses), so that `__missing__` / `__getitem__` overrides are
+    /// not bypassed.
+    builtins_dict: Option<&'a PyExact<PyDict>>,
     object: &'a Py<Frame>,
     lasti: &'a PyAtomic<u32>,
     state: &'a mut FrameState,
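
Note: the point of `builtins_dict` is to hoist a per-instruction type check out
of the dispatch loop. A standalone sketch of the idea, with `std::any::Any` and
`HashMap` standing in for the VM's object protocol and dict type (all names
here are illustrative, not the VM's API):

    use std::any::Any;
    use std::collections::HashMap;

    struct Frame<'a> {
        // Cached downcast, resolved once per frame activation.
        builtins_dict: Option<&'a HashMap<String, i32>>,
    }

    impl<'a> Frame<'a> {
        fn new(builtins: &'a dyn Any) -> Self {
            // One downcast here replaces a type check on every LOAD_GLOBAL.
            Frame { builtins_dict: builtins.downcast_ref::<HashMap<String, i32>>() }
        }

        fn load_builtin(&self, name: &str) -> Option<i32> {
            // Fast path only; the real VM falls back to the mapping protocol
            // when the cache is None (e.g. builtins is a dict subclass).
            self.builtins_dict.and_then(|d| d.get(name).copied())
        }
    }

    fn main() {
        let mut builtins = HashMap::new();
        builtins.insert("len".to_string(), 1);
        let frame = Frame::new(&builtins);
        assert_eq!(frame.load_builtin("len"), Some(1));
    }
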
@@ -1669,30 +1692,29 @@ impl ExecutingFrame<'_> {
             Instruction::LoadAttr { idx } => self.load_attr(vm, idx.get(arg)),
             Instruction::LoadSuperAttr { arg: idx } => self.load_super_attr(vm, idx.get(arg)),
             Instruction::LoadBuildClass => {
-                let build_class =
-                    if let Some(builtins_dict) = self.builtins.downcast_ref::<PyDict>() {
-                        builtins_dict
-                            .get_item_opt(identifier!(vm, __build_class__), vm)?
-                            .ok_or_else(|| {
+                let build_class = if let Some(builtins_dict) = self.builtins_dict {
+                    builtins_dict
+                        .get_item_opt(identifier!(vm, __build_class__), vm)?
+                        .ok_or_else(|| {
+                            vm.new_name_error(
+                                "__build_class__ not found".to_owned(),
+                                identifier!(vm, __build_class__).to_owned(),
+                            )
+                        })?
+                } else {
+                    self.builtins
+                        .get_item(identifier!(vm, __build_class__), vm)
+                        .map_err(|e| {
+                            if e.fast_isinstance(vm.ctx.exceptions.key_error) {
                                 vm.new_name_error(
                                     "__build_class__ not found".to_owned(),
                                     identifier!(vm, __build_class__).to_owned(),
                                 )
-                            })?
-                    } else {
-                        self.builtins
-                            .get_item(identifier!(vm, __build_class__), vm)
-                            .map_err(|e| {
-                                if e.fast_isinstance(vm.ctx.exceptions.key_error) {
-                                    vm.new_name_error(
-                                        "__build_class__ not found".to_owned(),
-                                        identifier!(vm, __build_class__).to_owned(),
-                                    )
-                                } else {
-                                    e
-                                }
-                            })?
-                    };
+                            } else {
+                                e
+                            }
+                        })?
+                };
                 self.push_value(build_class);
                 Ok(None)
             }
@@ -3014,10 +3036,12 @@ impl ExecutingFrame<'_> {
     #[inline]
     fn load_global_or_builtin(&self, name: &Py<PyStr>, vm: &VirtualMachine) -> PyResult {
-        if let Some(builtins_dict) = self.builtins.downcast_ref::<PyDict>() {
-            // Fast path: builtins is a dict
-            self.globals
-                .get_chain(builtins_dict, name, vm)?
+        if let Some(builtins_dict) = self.builtins_dict {
+            // Fast path: both globals and builtins are exact dicts
+            // SAFETY: builtins_dict is only set when globals is also exact dict
+            let globals_exact = unsafe { PyExact::ref_unchecked(self.globals.as_ref()) };
+            globals_exact
+                .get_chain_exact(builtins_dict, name, vm)?
                 .ok_or_else(|| {
                     vm.new_name_error(format!("name '{name}' is not defined"), name.to_owned())
                 })
         } else {
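
Note: `get_chain_exact` fuses the two probes of LOAD_GLOBAL (globals first,
then builtins) into direct hash-table lookups with no dynamic dispatch in
between. The control flow, sketched over plain `HashMap`s (an illustration of
the lookup order, not the VM's storage):

    use std::collections::HashMap;

    fn get_chain<'a>(
        globals: &'a HashMap<&'static str, i32>,
        builtins: &'a HashMap<&'static str, i32>,
        name: &str,
    ) -> Option<&'a i32> {
        // Probe globals first; only on a miss fall back to builtins.
        globals.get(name).or_else(|| builtins.get(name))
    }

    fn main() {
        let globals = HashMap::from([("x", 1)]);
        let builtins = HashMap::from([("len", 2)]);
        assert_eq!(get_chain(&globals, &builtins, "x"), Some(&1));
        assert_eq!(get_chain(&globals, &builtins, "len"), Some(&2));
        assert_eq!(get_chain(&globals, &builtins, "missing"), None);
    }
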
@@ -3703,7 +3727,23 @@ impl ExecutingFrame<'_> {
         vm: &VirtualMachine,
         target: bytecode::Label,
     ) -> Result<bool, PyBaseExceptionRef> {
-        let top_of_stack = PyIter::new(self.top_value());
+        let top = self.top_value();
+
+        // FOR_ITER_RANGE: bypass generic iterator protocol for range iterators
+        if let Some(range_iter) = top.downcast_ref_if_exact::<PyRangeIterator>(vm) {
+            if let Some(value) = range_iter.next_fast() {
+                self.push_value(vm.ctx.new_int(value).into());
+                return Ok(true);
+            }
+            if vm.use_tracing.get() && !vm.is_none(&self.object.trace.lock()) {
+                let stop_exc = vm.new_stop_iteration(None);
+                self.fire_exception_trace(&stop_exc, vm)?;
+            }
+            self.jump(self.for_iter_jump_target(target));
+            return Ok(false);
+        }
+
+        let top_of_stack = PyIter::new(top);
         let next_obj = top_of_stack.next(vm);

         match next_obj {
@@ -3718,21 +3758,7 @@ impl ExecutingFrame<'_> {
                     let stop_exc = vm.new_stop_iteration(value);
                     self.fire_exception_trace(&stop_exc, vm)?;
                 }
-                // Skip END_FOR (base or instrumented) and jump to POP_ITER.
-                let target_idx = target.0 as usize;
-                let jump_target = if let Some(unit) = self.code.instructions.get(target_idx) {
-                    if matches!(
-                        unit.op,
-                        bytecode::Instruction::EndFor | bytecode::Instruction::InstrumentedEndFor
-                    ) {
-                        bytecode::Label(target.0 + 1)
-                    } else {
-                        target
-                    }
-                } else {
-                    target
-                };
-                self.jump(jump_target);
+                self.jump(self.for_iter_jump_target(target));
                 Ok(false)
             }
             Err(next_error) => {
@@ -3741,6 +3767,20 @@ impl ExecutingFrame<'_> {
             }
         }
     }
+
+    /// Compute the jump target for FOR_ITER exhaustion: skip END_FOR and jump to POP_ITER.
+    fn for_iter_jump_target(&self, target: bytecode::Label) -> bytecode::Label {
+        let target_idx = target.0 as usize;
+        if let Some(unit) = self.code.instructions.get(target_idx)
+            && matches!(
+                unit.op,
+                bytecode::Instruction::EndFor | bytecode::Instruction::InstrumentedEndFor
+            )
+        {
+            return bytecode::Label(target.0 + 1);
+        }
+        target
+    }

     fn execute_make_function(&mut self, vm: &VirtualMachine) -> FrameResult {
         // MakeFunction only takes code object, no flags
         let code_obj: PyRef<PyCode> = self
@@ -3789,8 +3829,33 @@ impl ExecutingFrame<'_> {
         let b_ref = &self.pop_value();
         let a_ref = &self.pop_value();
         let value = match op {
-            bytecode::BinaryOperator::Subtract => vm._sub(a_ref, b_ref),
-            bytecode::BinaryOperator::Add => vm._add(a_ref, b_ref),
+            // BINARY_OP_ADD_INT / BINARY_OP_SUBTRACT_INT fast paths:
+            // bypass binary_op1 dispatch for exact int types, use i64 arithmetic
+            // when possible to avoid BigInt heap allocation.
+            bytecode::BinaryOperator::Add | bytecode::BinaryOperator::InplaceAdd => {
+                if let (Some(a), Some(b)) = (
+                    a_ref.downcast_ref_if_exact::<PyInt>(vm),
+                    b_ref.downcast_ref_if_exact::<PyInt>(vm),
+                ) {
+                    Ok(self.int_add(a.as_bigint(), b.as_bigint(), vm))
+                } else if matches!(op, bytecode::BinaryOperator::Add) {
+                    vm._add(a_ref, b_ref)
+                } else {
+                    vm._iadd(a_ref, b_ref)
+                }
+            }
+            bytecode::BinaryOperator::Subtract | bytecode::BinaryOperator::InplaceSubtract => {
+                if let (Some(a), Some(b)) = (
+                    a_ref.downcast_ref_if_exact::<PyInt>(vm),
+                    b_ref.downcast_ref_if_exact::<PyInt>(vm),
+                ) {
+                    Ok(self.int_sub(a.as_bigint(), b.as_bigint(), vm))
+                } else if matches!(op, bytecode::BinaryOperator::Subtract) {
+                    vm._sub(a_ref, b_ref)
+                } else {
+                    vm._isub(a_ref, b_ref)
+                }
+            }
             bytecode::BinaryOperator::Multiply => vm._mul(a_ref, b_ref),
             bytecode::BinaryOperator::MatrixMultiply => vm._matmul(a_ref, b_ref),
             bytecode::BinaryOperator::Power => vm._pow(a_ref, b_ref, vm.ctx.none.as_object()),
@@ -3802,8 +3867,6 @@ impl ExecutingFrame<'_> {
             bytecode::BinaryOperator::Xor => vm._xor(a_ref, b_ref),
             bytecode::BinaryOperator::Or => vm._or(a_ref, b_ref),
             bytecode::BinaryOperator::And => vm._and(a_ref, b_ref),
-            bytecode::BinaryOperator::InplaceSubtract => vm._isub(a_ref, b_ref),
-            bytecode::BinaryOperator::InplaceAdd => vm._iadd(a_ref, b_ref),
             bytecode::BinaryOperator::InplaceMultiply => vm._imul(a_ref, b_ref),
             bytecode::BinaryOperator::InplaceMatrixMultiply => vm._imatmul(a_ref, b_ref),
             bytecode::BinaryOperator::InplacePower => {
@@ -3824,6 +3887,30 @@ impl ExecutingFrame<'_> {

         Ok(None)
     }

+    /// Int addition with i64 fast path to avoid BigInt heap allocation.
+    #[inline]
+    fn int_add(&self, a: &BigInt, b: &BigInt, vm: &VirtualMachine) -> PyObjectRef {
+        use num_traits::ToPrimitive;
+        if let (Some(av), Some(bv)) = (a.to_i64(), b.to_i64())
+            && let Some(result) = av.checked_add(bv)
+        {
+            return vm.ctx.new_int(result).into();
+        }
+        vm.ctx.new_int(a + b).into()
+    }
+
+    /// Int subtraction with i64 fast path to avoid BigInt heap allocation.
+    #[inline]
+    fn int_sub(&self, a: &BigInt, b: &BigInt, vm: &VirtualMachine) -> PyObjectRef {
+        use num_traits::ToPrimitive;
+        if let (Some(av), Some(bv)) = (a.to_i64(), b.to_i64())
+            && let Some(result) = av.checked_sub(bv)
+        {
+            return vm.ctx.new_int(result).into();
+        }
+        vm.ctx.new_int(a - b).into()
+    }
+
     #[cold]
     fn setup_annotations(&mut self, vm: &VirtualMachine) -> FrameResult {
         let __annotations__ = identifier!(vm, __annotations__);
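
Note: `int_add`/`int_sub` bet that most Python ints fit in a machine word, so
the common case allocates nothing and falls back to the wide representation
only when `to_i64` or `checked_add` fails. The shape of that fast path,
sketched with a toy `Int` type in which `i128` plays the role of the
heap-allocated `BigInt` (purely illustrative):

    #[derive(Debug)]
    enum Int {
        Small(i64),
        Big(i128), // pretend this is a heap big integer
    }

    impl Int {
        fn to_i64(&self) -> Option<i64> {
            match *self {
                Int::Small(v) => Some(v),
                Int::Big(v) => i64::try_from(v).ok(),
            }
        }
    }

    // Mirrors int_add: only widen when the operands do not fit in i64
    // or the machine add overflows.
    fn int_add(a: &Int, b: &Int) -> Int {
        if let (Some(av), Some(bv)) = (a.to_i64(), b.to_i64())
            && let Some(result) = av.checked_add(bv)
        {
            return Int::Small(result); // fast path: no allocation
        }
        let wide = match (a, b) {
            (Int::Small(x), Int::Small(y)) => *x as i128 + *y as i128,
            (Int::Big(x), Int::Small(y)) => x + *y as i128,
            (Int::Small(x), Int::Big(y)) => *x as i128 + y,
            (Int::Big(x), Int::Big(y)) => x + y,
        };
        Int::Big(wide)
    }

    fn main() {
        println!("{:?}", int_add(&Int::Small(1), &Int::Small(2)));
        println!("{:?}", int_add(&Int::Small(i64::MAX), &Int::Small(1)));
    }
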
@@ -3853,43 +3940,18 @@ impl ExecutingFrame<'_> {
         let value = self.pop_value();
         let size = size as usize;

-        // Fast path for exact tuple/list types (not subclasses) — check
-        // length directly without creating an iterator, matching
-        // UNPACK_SEQUENCE_TUPLE / UNPACK_SEQUENCE_LIST specializations.
+        // Fast path for exact tuple/list types (not subclasses) — push
+        // elements directly from the slice without intermediate Vec allocation,
+        // matching UNPACK_SEQUENCE_TUPLE / UNPACK_SEQUENCE_LIST specializations.
         let cls = value.class();
-        let fast_elements: Option<Vec<PyObjectRef>> = if cls.is(vm.ctx.types.tuple_type) {
-            Some(value.downcast_ref::<PyTuple>().unwrap().as_slice().to_vec())
-        } else if cls.is(vm.ctx.types.list_type) {
-            Some(
-                value
-                    .downcast_ref::<PyList>()
-                    .unwrap()
-                    .borrow_vec()
-                    .to_vec(),
-            )
-        } else {
-            None
-        };
-        if let Some(elements) = fast_elements {
-            return match elements.len().cmp(&size) {
-                core::cmp::Ordering::Equal => {
-                    self.state.stack.extend(
-                        elements
-                            .into_iter()
-                            .rev()
-                            .map(|e| Some(PyStackRef::new_owned(e))),
-                    );
-                    Ok(None)
-                }
-                core::cmp::Ordering::Greater => Err(vm.new_value_error(format!(
-                    "too many values to unpack (expected {size}, got {})",
-                    elements.len()
-                ))),
-                core::cmp::Ordering::Less => Err(vm.new_value_error(format!(
-                    "not enough values to unpack (expected {size}, got {})",
-                    elements.len()
-                ))),
-            };
+        if cls.is(vm.ctx.types.tuple_type) {
+            let tuple = value.downcast_ref::<PyTuple>().unwrap();
+            return self.unpack_fast(tuple.as_slice(), size, vm);
+        }
+        if cls.is(vm.ctx.types.list_type) {
+            let list = value.downcast_ref::<PyList>().unwrap();
+            let borrowed = list.borrow_vec();
+            return self.unpack_fast(&borrowed, size, vm);
         }

         // General path — iterate up to `size + 1` elements to avoid
@@ -3955,6 +4017,30 @@ impl ExecutingFrame<'_> {
         }
     }

+    fn unpack_fast(
+        &mut self,
+        elements: &[PyObjectRef],
+        size: usize,
+        vm: &VirtualMachine,
+    ) -> FrameResult {
+        match elements.len().cmp(&size) {
+            core::cmp::Ordering::Equal => {
+                for elem in elements.iter().rev() {
+                    self.push_value(elem.clone());
+                }
+                Ok(None)
+            }
+            core::cmp::Ordering::Greater => Err(vm.new_value_error(format!(
+                "too many values to unpack (expected {size}, got {})",
+                elements.len()
+            ))),
+            core::cmp::Ordering::Less => Err(vm.new_value_error(format!(
+                "not enough values to unpack (expected {size}, got {})",
+                elements.len()
+            ))),
+        }
+    }
+
     fn convert_value(
         &mut self,
         conversion: bytecode::ConvertValueOparg,
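
Note: `unpack_fast` replaces the old slice-to-`Vec` copy with one arity check
plus reverse pushes, so the first element ends up on top of the stack. The same
logic over a plain `Vec` used as the stack (a sketch, not the VM's stack type):

    fn unpack_fast<T: Clone>(stack: &mut Vec<T>, elements: &[T], size: usize) -> Result<(), String> {
        match elements.len().cmp(&size) {
            std::cmp::Ordering::Equal => {
                // Push in reverse so element 0 is popped first.
                for elem in elements.iter().rev() {
                    stack.push(elem.clone());
                }
                Ok(())
            }
            std::cmp::Ordering::Greater => Err(format!(
                "too many values to unpack (expected {size}, got {})",
                elements.len()
            )),
            std::cmp::Ordering::Less => Err(format!(
                "not enough values to unpack (expected {size}, got {})",
                elements.len()
            )),
        }
    }

    fn main() {
        let mut stack = Vec::new();
        unpack_fast(&mut stack, &[1, 2, 3], 3).unwrap();
        assert_eq!(stack.pop(), Some(1)); // first element on top
    }
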
@@ -3996,7 +4082,30 @@ impl ExecutingFrame<'_> {
     ) -> FrameResult {
         let b = self.pop_value();
         let a = self.pop_value();
-        let value = a.rich_compare(b, op.into(), vm)?;
+        let cmp_op: PyComparisonOp = op.into();
+
+        // COMPARE_OP_INT: leaf type, cannot recurse — skip rich_compare dispatch
+        if let (Some(a_int), Some(b_int)) = (
+            a.downcast_ref_if_exact::<PyInt>(vm),
+            b.downcast_ref_if_exact::<PyInt>(vm),
+        ) {
+            let result = cmp_op.eval_ord(a_int.as_bigint().cmp(b_int.as_bigint()));
+            self.push_value(vm.ctx.new_bool(result).into());
+            return Ok(None);
+        }
+        // COMPARE_OP_FLOAT: leaf type, cannot recurse — skip rich_compare dispatch.
+        // Falls through on NaN (partial_cmp returns None) for correct != semantics.
+        if let (Some(a_f), Some(b_f)) = (
+            a.downcast_ref_if_exact::<PyFloat>(vm),
+            b.downcast_ref_if_exact::<PyFloat>(vm),
+        ) && let Some(ord) = a_f.to_f64().partial_cmp(&b_f.to_f64())
+        {
+            let result = cmp_op.eval_ord(ord);
+            self.push_value(vm.ctx.new_bool(result).into());
+            return Ok(None);
+        }
+
+        let value = a.rich_compare(b, cmp_op, vm)?;
         self.push_value(value);
         Ok(None)
     }
diff --git a/crates/vm/src/protocol/object.rs b/crates/vm/src/protocol/object.rs
index 78ac9903905..513ecbefeb5 100644
--- a/crates/vm/src/protocol/object.rs
+++ b/crates/vm/src/protocol/object.rs
@@ -277,13 +277,24 @@ impl PyObject {

     // Perform a comparison, raising TypeError when the requested comparison
     // operator is not supported.
-    // see: CPython PyObject_RichCompare
+    // see: PyObject_RichCompare / do_richcompare
     #[inline] // called by ExecutingFrame::execute_compare with const op
     fn _cmp(
         &self,
         other: &Self,
         op: PyComparisonOp,
         vm: &VirtualMachine,
+    ) -> PyResult<Either<PyObjectRef, bool>> {
+        // Single recursion guard for the entire comparison
+        // (do_richcompare in Objects/object.c).
+        vm.with_recursion("in comparison", || self._cmp_inner(other, op, vm))
+    }
+
+    fn _cmp_inner(
+        &self,
+        other: &Self,
+        op: PyComparisonOp,
+        vm: &VirtualMachine,
     ) -> PyResult<Either<PyObjectRef, bool>> {
         let swapped = op.swapped();
         let call_cmp = |obj: &Self, other: &Self, op| {
@@ -302,19 +313,17 @@ impl PyObject {
             !self_class.is(other_class) && other_class.fast_issubclass(self_class)
         };
         if is_strict_subclass {
-            let res = vm.with_recursion("in comparison", || call_cmp(other, self, swapped))?;
+            let res = call_cmp(other, self, swapped)?;
             checked_reverse_op = true;
             if let PyArithmeticValue::Implemented(x) = res {
                 return Ok(x);
             }
         }
-        if let PyArithmeticValue::Implemented(x) =
-            vm.with_recursion("in comparison", || call_cmp(self, other, op))?
-        {
+        if let PyArithmeticValue::Implemented(x) = call_cmp(self, other, op)? {
             return Ok(x);
         }
         if !checked_reverse_op {
-            let res = vm.with_recursion("in comparison", || call_cmp(other, self, swapped))?;
+            let res = call_cmp(other, self, swapped)?;
             if let PyArithmeticValue::Implemented(x) = res {
                 return Ok(x);
             }
diff --git a/crates/vm/src/signal.rs b/crates/vm/src/signal.rs
index ede037c3791..87c4fe2749f 100644
--- a/crates/vm/src/signal.rs
+++ b/crates/vm/src/signal.rs
@@ -45,6 +45,12 @@ pub fn check_signals(vm: &VirtualMachine) -> PyResult<()> {
         return Ok(());
     }

+    // Read-only check first: avoids cache-line invalidation on every
+    // instruction when no signal is pending (the common case).
+    if !ANY_TRIGGERED.load(Ordering::Relaxed) {
+        return Ok(());
+    }
+    // Atomic RMW only when a signal is actually pending.
     if !ANY_TRIGGERED.swap(false, Ordering::Acquire) {
         return Ok(());
     }
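
Note: the signal.rs change is a classic double-checked flag: a shared read
keeps the cache line in the shared state on the hot no-signal path, and the
exclusive RMW (`swap`) happens only when a signal really arrived. A
self-contained model of the two-step check (the names mirror the patch, but
the harness is illustrative):

    use std::sync::atomic::{AtomicBool, Ordering};

    static ANY_TRIGGERED: AtomicBool = AtomicBool::new(false);

    fn check_signals() -> bool {
        // Common case: read-only, no cache-line invalidation.
        if !ANY_TRIGGERED.load(Ordering::Relaxed) {
            return false;
        }
        // A signal may be pending: claim it with an exclusive RMW.
        ANY_TRIGGERED.swap(false, Ordering::Acquire)
    }

    fn main() {
        assert!(!check_signals());
        ANY_TRIGGERED.store(true, Ordering::Release); // as a handler would
        assert!(check_signals());
        assert!(!check_signals()); // already consumed
    }
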