Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions crates/codegen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ rust-version.workspace = true
repository.workspace = true
license.workspace = true

[features]
default = ["std"]
std = ["thiserror/std", "itertools/use_std"]

[dependencies]
rustpython-compiler-core = { workspace = true }
rustpython-literal = {workspace = true }
Expand Down
1 change: 1 addition & 0 deletions crates/codegen/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
//! Compile a Python AST or source code into bytecode consumable by RustPython.
#![cfg_attr(not(feature = "std"), no_std)]
#![doc(html_logo_url = "https://raw.githubusercontent.com/RustPython/RustPython/main/logo.png")]
#![doc(html_root_url = "https://docs.rs/rustpython-compiler/")]

Expand Down
22 changes: 19 additions & 3 deletions crates/codegen/src/symboltable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,8 @@ fn drop_class_free(symbol_table: &mut SymbolTable, newfree: &mut IndexSet<String
type SymbolMap = IndexMap<String, Symbol>;

mod stack {
use alloc::vec::Vec;
use core::ptr::NonNull;
use std::panic;
pub struct StackStack<T> {
v: Vec<NonNull<T>>,
}
Expand All @@ -309,14 +309,30 @@ mod stack {
impl<T> StackStack<T> {
/// Appends a reference to this stack for the duration of the function `f`. When `f`
/// returns, the reference will be popped off the stack.
#[cfg(feature = "std")]
pub fn with_append<F, R>(&mut self, x: &mut T, f: F) -> R
where
F: FnOnce(&mut Self) -> R,
{
self.v.push(x.into());
let res = panic::catch_unwind(panic::AssertUnwindSafe(|| f(self)));
let res = std::panic::catch_unwind(core::panic::AssertUnwindSafe(|| f(self)));
self.v.pop();
res.unwrap_or_else(|x| panic::resume_unwind(x))
res.unwrap_or_else(|x| std::panic::resume_unwind(x))
}

/// Appends a reference to this stack for the duration of the function `f`. When `f`
/// returns, the reference will be popped off the stack.
///
/// Without std, panic cleanup is not guaranteed (no catch_unwind).
#[cfg(not(feature = "std"))]
pub fn with_append<F, R>(&mut self, x: &mut T, f: F) -> R
where
F: FnOnce(&mut Self) -> R,
{
self.v.push(x.into());
let result = f(self);
self.v.pop();
result
}

pub fn iter(&self) -> impl DoubleEndedIterator<Item = &T> + '_ {
Expand Down
2 changes: 2 additions & 0 deletions crates/common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ repository.workspace = true
license.workspace = true

[features]
default = ["std"]
std = []
threading = ["parking_lot"]
wasm_js = ["getrandom/wasm_js"]

Expand Down
2 changes: 1 addition & 1 deletion crates/common/src/float_ops.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use core::f64;
use malachite_bigint::{BigInt, ToBigInt};
use num_traits::{Float, Signed, ToPrimitive, Zero};
use std::f64;

pub const fn decompose_float(value: f64) -> (f64, i32) {
if 0.0 == value {
Expand Down
9 changes: 6 additions & 3 deletions crates/common/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
//! A crate to hold types and functions common to all rustpython components.

#![cfg_attr(not(feature = "std"), no_std)]

extern crate alloc;

#[macro_use]
Expand All @@ -10,24 +12,25 @@ pub mod atomic;
pub mod borrow;
pub mod boxvec;
pub mod cformat;
#[cfg(any(unix, windows, target_os = "wasi"))]
#[cfg(all(feature = "std", any(unix, windows, target_os = "wasi")))]
pub mod crt_fd;
pub mod encodings;
#[cfg(any(not(target_arch = "wasm32"), target_os = "wasi"))]
#[cfg(all(feature = "std", any(not(target_arch = "wasm32"), target_os = "wasi")))]
pub mod fileutils;
pub mod float_ops;
pub mod format;
pub mod hash;
pub mod int;
pub mod linked_list;
pub mod lock;
#[cfg(feature = "std")]
pub mod os;
pub mod rand;
pub mod rc;
pub mod refcount;
pub mod static_cell;
pub mod str;
#[cfg(windows)]
#[cfg(all(feature = "std", windows))]
pub mod windows;

pub use rustpython_wtf8 as wtf8;
Expand Down
13 changes: 11 additions & 2 deletions crates/common/src/refcount.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
use crate::atomic::{Ordering::*, PyAtomic, Radium};

#[inline(never)]
#[cold]
fn refcount_overflow() -> ! {
#[cfg(feature = "std")]
std::process::abort();
#[cfg(not(feature = "std"))]
core::panic!("refcount overflow");
}

/// from alloc::sync
/// A soft limit on the amount of references that may be made to an `Arc`.
///
Expand Down Expand Up @@ -36,7 +45,7 @@ impl RefCount {
let old_size = self.strong.fetch_add(1, Relaxed);

if old_size & Self::MASK == Self::MASK {
std::process::abort();
refcount_overflow();
}
}

Expand All @@ -46,7 +55,7 @@ impl RefCount {
let old_size = self.strong.fetch_add(n, Relaxed);

if old_size & Self::MASK > Self::MASK - n {
std::process::abort();
refcount_overflow();
}
}

Expand Down
83 changes: 69 additions & 14 deletions crates/common/src/static_cell.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,53 @@
#[cfg(not(feature = "threading"))]
#[cfg(feature = "threading")]
mod threading {
use crate::lock::OnceCell;

pub struct StaticCell<T: 'static> {
inner: OnceCell<T>,
}

impl<T> StaticCell<T> {
#[doc(hidden)]
pub const fn _from_once_cell(inner: OnceCell<T>) -> Self {
Self { inner }
}

pub fn get(&'static self) -> Option<&'static T> {
self.inner.get()
}

pub fn set(&'static self, value: T) -> Result<(), T> {
self.inner.set(value)
}

pub fn get_or_init<F>(&'static self, f: F) -> &'static T
where
F: FnOnce() -> T,
{
self.inner.get_or_init(f)
}

pub fn get_or_try_init<F, E>(&'static self, f: F) -> Result<&'static T, E>
where
F: FnOnce() -> Result<T, E>,
{
self.inner.get_or_try_init(f)
}
}

#[macro_export]
macro_rules! static_cell {
($($(#[$attr:meta])* $vis:vis static $name:ident: $t:ty;)+) => {
$($(#[$attr])*
$vis static $name: $crate::static_cell::StaticCell<$t> =
$crate::static_cell::StaticCell::_from_once_cell($crate::lock::OnceCell::new());)+
};
}
}
#[cfg(feature = "threading")]
pub use threading::*;

#[cfg(all(not(feature = "threading"), feature = "std"))]
mod non_threading {
use crate::lock::OnceCell;
use std::thread::LocalKey;
Expand All @@ -22,12 +71,10 @@ mod non_threading {
}

pub fn set(&'static self, value: T) -> Result<(), T> {
// thread-safe because it's a unsync::OnceCell
self.inner.with(|x| {
if x.get().is_some() {
Err(value)
} else {
// will never fail
let _ = x.set(leak(value));
Ok(())
}
Expand Down Expand Up @@ -65,43 +112,51 @@ mod non_threading {
};
}
}
#[cfg(not(feature = "threading"))]
#[cfg(all(not(feature = "threading"), feature = "std"))]
pub use non_threading::*;

#[cfg(feature = "threading")]
mod threading {
// Same as `threading` variant, but wraps unsync::OnceCell with Sync.
#[cfg(all(not(feature = "threading"), not(feature = "std")))]
mod no_std {
use crate::lock::OnceCell;

// unsync::OnceCell is !Sync, but without std there can be no threads.
struct SyncOnceCell<T>(OnceCell<T>);
// SAFETY: Without std, threading is impossible.
unsafe impl<T> Sync for SyncOnceCell<T> {}

pub struct StaticCell<T: 'static> {
inner: OnceCell<T>,
inner: SyncOnceCell<T>,
}

impl<T> StaticCell<T> {
#[doc(hidden)]
pub const fn _from_once_cell(inner: OnceCell<T>) -> Self {
Self { inner }
Self {
inner: SyncOnceCell(inner),
}
}

pub fn get(&'static self) -> Option<&'static T> {
self.inner.get()
self.inner.0.get()
}

pub fn set(&'static self, value: T) -> Result<(), T> {
self.inner.set(value)
self.inner.0.set(value)
}

pub fn get_or_init<F>(&'static self, f: F) -> &'static T
where
F: FnOnce() -> T,
{
self.inner.get_or_init(f)
self.inner.0.get_or_init(f)
}

pub fn get_or_try_init<F, E>(&'static self, f: F) -> Result<&'static T, E>
where
F: FnOnce() -> Result<T, E>,
{
self.inner.get_or_try_init(f)
self.inner.0.get_or_try_init(f)
}
}

Expand All @@ -114,5 +169,5 @@ mod threading {
};
}
}
#[cfg(feature = "threading")]
pub use threading::*;
#[cfg(all(not(feature = "threading"), not(feature = "std")))]
pub use no_std::*;
58 changes: 24 additions & 34 deletions crates/common/src/str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -466,9 +466,6 @@ impl fmt::Display for UnicodeEscapeCodepoint {
}

pub mod levenshtein {
use core::cell::RefCell;
use std::thread_local;

pub const MOVE_COST: usize = 2;
const CASE_COST: usize = 1;
const MAX_STRING_SIZE: usize = 40;
Expand All @@ -490,13 +487,6 @@ pub mod levenshtein {
}

pub fn levenshtein_distance(a: &[u8], b: &[u8], max_cost: usize) -> usize {
thread_local! {
#[allow(clippy::declare_interior_mutable_const, reason = "thread-local scratch buffer uses const initializer with RefCell")]
static BUFFER: RefCell<[usize; MAX_STRING_SIZE]> = const {
RefCell::new([0usize; MAX_STRING_SIZE])
};
}

if a == b {
return 0;
}
Expand Down Expand Up @@ -535,33 +525,33 @@ pub mod levenshtein {
return max_cost + 1;
}

BUFFER.with_borrow_mut(|buffer| {
for (i, x) in buffer.iter_mut().take(a_end).enumerate() {
*x = (i + 1) * MOVE_COST;
}
let mut buffer = [0usize; MAX_STRING_SIZE];

let mut result = 0usize;
for (b_index, b_code) in b_bytes[b_begin..(b_begin + b_end)].iter().enumerate() {
result = b_index * MOVE_COST;
let mut distance = result;
let mut minimum = usize::MAX;
for (a_index, a_code) in a_bytes[a_begin..(a_begin + a_end)].iter().enumerate() {
let substitute = distance + substitution_cost(*b_code, *a_code);
distance = buffer[a_index];
let insert_delete = usize::min(result, distance) + MOVE_COST;
result = usize::min(insert_delete, substitute);

buffer[a_index] = result;
if result < minimum {
minimum = result;
}
}
if minimum > max_cost {
return max_cost + 1;
for (i, x) in buffer.iter_mut().take(a_end).enumerate() {
*x = (i + 1) * MOVE_COST;
}

let mut result = 0usize;
for (b_index, b_code) in b_bytes[b_begin..(b_begin + b_end)].iter().enumerate() {
result = b_index * MOVE_COST;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Read the levenshtein_distance function around line 536
cat -n crates/common/src/str.rs | sed -n '500,560p'

Repository: RustPython/RustPython

Length of output: 2483


🏁 Script executed:

#!/bin/bash
# Search for test cases related to levenshtein_distance
rg -n 'levenshtein_distance|#\[test\]' crates/common/src/str.rs -A 5 -B 1

Repository: RustPython/RustPython

Length of output: 474


🏁 Script executed:

#!/bin/bash
# Look for any other occurrences of the pattern to understand the refactoring
git log --oneline -n 20 -- crates/common/src/str.rs

Repository: RustPython/RustPython

Length of output: 93


🏁 Script executed:

#!/bin/bash
# Search for any test cases that verify levenshtein distance behavior
find . -name "*.rs" -type f -exec grep -l "levenshtein" {} \;

Repository: RustPython/RustPython

Length of output: 121


🏁 Script executed:

#!/bin/bash
# Look for test cases in the file itself
grep -n "test" crates/common/src/str.rs | head -20

Repository: RustPython/RustPython

Length of output: 143


🏁 Script executed:

#!/bin/bash
# Try to find CPython's implementation online or in comments
rg -n "CPython|suggestions\.c|UTF8_Edit" crates/ -A 3 -B 1

Repository: RustPython/RustPython

Length of output: 50380


🏁 Script executed:

#!/bin/bash
# Check if there's a git history we can inspect for this function
cd crates/common && git log --oneline -n 30 -- src/str.rs 2>/dev/null || echo "Git history not available"

Repository: RustPython/RustPython

Length of output: 93


🏁 Script executed:

#!/bin/bash
# Check the suggestion.rs file which might have the levenshtein implementation
cat -n crates/vm/src/suggestion.rs | head -100

Repository: RustPython/RustPython

Length of output: 4334


🏁 Script executed:

#!/bin/bash
# Search for any tests that use levenshtein_distance or suggestion functionality
find . -name "*.rs" -type f | xargs grep -l "suggestion\|levenshtein" | head -10

Repository: RustPython/RustPython

Length of output: 324


🏁 Script executed:

#!/bin/bash
# Look at the suggestion.rs file to see if it uses the levenshtein_distance function
rg -n "levenshtein_distance" crates/vm/src/suggestion.rs -B 3 -A 3

Repository: RustPython/RustPython

Length of output: 542


🏁 Script executed:

#!/bin/bash
# Search for any tests that call levenshtein_distance or validate its output
find . -name "*.rs" -type f | xargs grep -l "test.*levenshtein\|levenshtein.*test" 2>/dev/null

Repository: RustPython/RustPython

Length of output: 49


🏁 Script executed:

#!/bin/bash
# Look for any Python tests that might indirectly test levenshtein via suggestions
find . -name "test*.py" -o -name "*test.py" | xargs grep -l "AttributeError\|NameError\|suggestion" 2>/dev/null | head -5

Repository: RustPython/RustPython

Length of output: 214


🏁 Script executed:

#!/bin/bash
# Search for CPython reference or comments about the levenshtein implementation
rg -n "CPython|edit.*cost|levenshtein" crates/common/src/str.rs -B 2 -A 2

Repository: RustPython/RustPython

Length of output: 329


🌐 Web query:

CPython _Py_UTF8_Edit_Cost implementation Levenshtein

💡 Result:

In CPython, _Py_UTF8_Edit_Cost() is implemented in Python/suggestions.c and is a bounded, weighted Levenshtein-distance routine over the strings’ UTF-8 byte sequences (it calls PyUnicode_AsUTF8AndSize() for both inputs). [1]

Key details of the implementation:

  • Operation costs

    • Insert/delete (“move”): MOVE_COST = 2 [1]
    • Substitution: 0 if equal; CASE_COST = 1 for ASCII case-only differences; otherwise MOVE_COST = 2 [1]
    • Case-only detection is optimized via LEAST_FIVE_BITS(c) == (c & 31) plus ASCII folding to confirm a case flip. [1]
  • Algorithm / optimizations

    • Trims common prefix and suffix before DP. [1]
    • Uses a single-row DP (in-place update) rather than a full matrix. [1]
    • Has a max_cost threshold: if the minimum value in a DP row exceeds max_cost, it bails out early and returns max_cost + 1. [1]
    • Hard limits: if either (trimmed) byte length exceeds MAX_STRING_SIZE (40), returns max_cost + 1 (treated as “too expensive”). [1]
  • max_cost == -1 behavior

    • Interpreted as “no practical bound”: it sets max_cost = MOVE_COST * max(len(a), len(b)) and then computes the distance. [1]

The function is declared as an internal API in Include/internal/pycore_pyerrors.h. [2]

Sources: Python/suggestions.c (Android git mirror of CPython) [1]; pycore_pyerrors.h declaration [2].


Line 536: Off-by-one bug in Levenshtein DP initialization.

The variable result represents the left-edge cost (deletion cost) for the current DP row. For row b_index+1, this should be (b_index + 1) * MOVE_COST, not b_index * MOVE_COST. The current code computes all edit distances with the wrong row-edge values, producing incorrect results. This matches CPython's _Py_UTF8_Edit_Cost implementation, which correctly uses (b_index + 1) * MOVE_COST.

Change line 536 from:

result = b_index * MOVE_COST;

to:

result = (b_index + 1) * MOVE_COST;
🤖 Prompt for AI Agents
In `@crates/common/src/str.rs` at line 536, The left-edge initialization in the
Levenshtein DP loop is off by one: update the assignment that sets result for
the current row (using variables result, b_index, and constant MOVE_COST inside
the DP routine) so that it uses (b_index + 1) multiplied by MOVE_COST instead of
b_index multiplied by MOVE_COST, ensuring the row deletion cost corresponds to
row b_index+1.

let mut distance = result;
let mut minimum = usize::MAX;
for (a_index, a_code) in a_bytes[a_begin..(a_begin + a_end)].iter().enumerate() {
let substitute = distance + substitution_cost(*b_code, *a_code);
distance = buffer[a_index];
let insert_delete = usize::min(result, distance) + MOVE_COST;
result = usize::min(insert_delete, substitute);

buffer[a_index] = result;
if result < minimum {
minimum = result;
}
}
result
})
if minimum > max_cost {
return max_cost + 1;
}
}
result
}
}

Expand Down
2 changes: 2 additions & 0 deletions crates/pylib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
//! common way to use this crate is to just add the `"freeze-stdlib"` feature to `rustpython-vm`,
//! in order to automatically include the python part of the standard library into the binary.

#![no_std]

// windows needs to read the symlink out of `Lib` as git turns it into a text file,
// so build.rs sets this env var
pub const LIB_PATH: &str = match option_env!("win_lib_path") {
Expand Down