Skip to content

Commit c7340bd

Browse files
committed
Reduce memory footprint of notebooks
1 parent 6185a2a commit c7340bd

File tree

3 files changed

+103
-73
lines changed

3 files changed

+103
-73
lines changed

crates/ruff_db/src/diagnostic/render/full.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -112,16 +112,16 @@ impl std::fmt::Display for Diff<'_> {
112112
// `None`, indicating a regular script file, all the lines will be in one "cell" under the
113113
// `None` key.
114114
let cells = if let Some(notebook_index) = &self.notebook_index {
115-
let mut last_cell = OneIndexed::MIN;
115+
let mut last_cell_index = OneIndexed::MIN;
116116
let mut cells: Vec<(Option<OneIndexed>, TextSize)> = Vec::new();
117-
for (row, cell) in notebook_index.iter() {
118-
if cell != last_cell {
119-
let offset = source_code.line_start(row);
120-
cells.push((Some(last_cell), offset));
121-
last_cell = cell;
117+
for cell in notebook_index.iter() {
118+
if cell.cell_index() != last_cell_index {
119+
let offset = source_code.line_start(cell.start_row());
120+
cells.push((Some(last_cell_index), offset));
121+
last_cell_index = cell.cell_index();
122122
}
123123
}
124-
cells.push((Some(last_cell), source_text.text_len()));
124+
cells.push((Some(last_cell_index), source_text.text_len()));
125125
cells
126126
} else {
127127
vec![(None, source_text.text_len())]

crates/ruff_notebook/src/index.rs

Lines changed: 57 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,37 +8,49 @@ use ruff_source_file::{LineColumn, OneIndexed, SourceLocation};
88
/// [`ruff_text_size::TextSize`] to jupyter notebook cell/row/column.
99
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
1010
pub struct NotebookIndex {
11-
/// Enter a row (1-based), get back the cell (1-based)
12-
pub(super) row_to_cell: Vec<OneIndexed>,
13-
/// Enter a row (1-based), get back the row in cell (1-based)
14-
pub(super) row_to_row_in_cell: Vec<OneIndexed>,
11+
/// Stores the starting row and the absolute cell index for every Python (valid) cell.
12+
///
13+
/// The index in this vector corresponds to the Python cell index (valid cell index).
14+
pub(super) cell_starts: Vec<CellStart>,
1515
}
1616

1717
impl NotebookIndex {
18-
pub fn new(row_to_cell: Vec<OneIndexed>, row_to_row_in_cell: Vec<OneIndexed>) -> Self {
19-
Self {
20-
row_to_cell,
21-
row_to_row_in_cell,
18+
/// Helper method to find the cell index and start row for a given row.
19+
/// Returns the [`CellStart`] of the cell containing `row`, or `None` if `row` precedes the first cell's start row (including when there are no cells).
20+
fn find_cell(&self, row: OneIndexed) -> Option<CellStart> {
21+
match self
22+
.cell_starts
23+
.binary_search_by_key(&row, |start| start.start_row)
24+
{
25+
// Exact match: row is the first row of a cell
26+
Ok(cell_index) => Some(self.cell_starts[cell_index]),
27+
// Not an exact match: the cell is the one before the insertion point
28+
Err(0) => None, // Row is before the first cell
29+
Err(insertion_point) => {
30+
let cell_index = insertion_point - 1;
31+
Some(self.cell_starts[cell_index])
32+
}
2233
}
2334
}
2435

25-
/// Returns the cell number (1-based) for the given row (1-based).
36+
/// Returns the (raw) cell number (1-based) for the given row (1-based).
2637
pub fn cell(&self, row: OneIndexed) -> Option<OneIndexed> {
27-
self.row_to_cell.get(row.to_zero_indexed()).copied()
38+
self.find_cell(row).map(|start| start.raw_cell_index)
2839
}
2940

3041
/// Returns the row number (1-based) in the cell (1-based) for the
3142
/// given row (1-based).
3243
pub fn cell_row(&self, row: OneIndexed) -> Option<OneIndexed> {
33-
self.row_to_row_in_cell.get(row.to_zero_indexed()).copied()
44+
self.find_cell(row)
45+
.map(|start| OneIndexed::from_zero_indexed(row.get() - start.start_row.get()))
3446
}
3547

36-
/// Returns an iterator over the row:cell-number pairs (both 1-based).
37-
pub fn iter(&self) -> impl Iterator<Item = (OneIndexed, OneIndexed)> {
38-
self.row_to_cell
39-
.iter()
40-
.enumerate()
41-
.map(|(row, cell)| (OneIndexed::from_zero_indexed(row), *cell))
48+
/// Returns an iterator over the [`CellStart`] of each cell.
49+
///
50+
/// This yields one entry per cell, holding the cell's first row number in the
51+
/// concatenated source text and its raw cell index (both 1-based).
52+
pub fn iter(&self) -> impl Iterator<Item = CellStart> + '_ {
53+
self.cell_starts.iter().copied()
4254
}
4355

4456
/// Translates the given [`LineColumn`] based on the indexing table.
@@ -49,6 +61,10 @@ impl NotebookIndex {
4961
LineColumn {
5062
line: self
5163
.cell_row(source_location.line)
64+
.or_else(|| {
65+
// Only reached when no cell contains the row (it precedes the first cell's start row); falls back to the first row, `OneIndexed::MIN`.
66+
self.cell_starts.last().map(|_| OneIndexed::MIN)
67+
})
5268
.unwrap_or(OneIndexed::MIN),
5369
column: source_location.column,
5470
}
@@ -62,8 +78,32 @@ impl NotebookIndex {
6278
SourceLocation {
6379
line: self
6480
.cell_row(source_location.line)
81+
.or_else(|| {
82+
// Only reached when no cell contains the row (it precedes the first cell's start row); falls back to the first row, `OneIndexed::MIN`.
83+
self.cell_starts.last().map(|_| OneIndexed::MIN)
84+
})
6585
.unwrap_or(OneIndexed::MIN),
6686
character_offset: source_location.character_offset,
6787
}
6888
}
6989
}
90+
91+
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)]
92+
pub struct CellStart {
93+
/// The row in the concatenated notebook source code at which
94+
/// this cell starts.
95+
pub(super) start_row: OneIndexed,
96+
97+
/// The absolute index of this cell in the notebook.
98+
pub(super) raw_cell_index: OneIndexed,
99+
}
100+
101+
impl CellStart {
102+
pub fn start_row(&self) -> OneIndexed {
103+
self.start_row
104+
}
105+
106+
pub fn cell_index(&self) -> OneIndexed {
107+
self.raw_cell_index
108+
}
109+
}

crates/ruff_notebook/src/notebook.rs

Lines changed: 39 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use ruff_text_size::TextSize;
1818
use crate::cell::CellOffsets;
1919
use crate::index::NotebookIndex;
2020
use crate::schema::{Cell, RawNotebook, SortAlphabetically, SourceValue};
21-
use crate::{CellMetadata, RawNotebookMetadata, schema};
21+
use crate::{CellMetadata, CellStart, RawNotebookMetadata, schema};
2222

2323
/// Run round-trip source code generation on a given Jupyter notebook file path.
2424
pub fn round_trip(path: &Path) -> anyhow::Result<String> {
@@ -317,14 +317,22 @@ impl Notebook {
317317
/// of lines visible in the UI is three. The same goes for [`SourceValue::String`]
318318
/// where we need to check for the trailing newline.
319319
///
320-
/// The index building is expensive as it needs to go through the content of
321-
/// every valid code cell.
320+
/// The index stores only each valid code cell's starting row and raw cell index
321+
/// rather than per-row mappings; building it still counts the lines of every such cell.
322322
fn build_index(&self) -> NotebookIndex {
323-
let mut row_to_cell = Vec::new();
324-
let mut row_to_row_in_cell = Vec::new();
323+
let mut cell_starts = Vec::with_capacity(self.valid_code_cells.len());
324+
325+
let mut current_row = OneIndexed::MIN;
325326

326327
for &cell_index in &self.valid_code_cells {
327-
let line_count = match &self.raw.cells[cell_index as usize].source() {
328+
let raw_cell_index = cell_index as usize;
329+
// Record the starting row of this cell
330+
cell_starts.push(CellStart {
331+
start_row: current_row,
332+
raw_cell_index: OneIndexed::from_zero_indexed(raw_cell_index),
333+
});
334+
335+
let line_count = match &self.raw.cells[raw_cell_index].source() {
328336
SourceValue::String(string) => {
329337
if string.is_empty() {
330338
1
@@ -342,17 +350,11 @@ impl Notebook {
342350
}
343351
}
344352
};
345-
row_to_cell.extend(std::iter::repeat_n(
346-
OneIndexed::from_zero_indexed(cell_index as usize),
347-
line_count,
348-
));
349-
row_to_row_in_cell.extend((0..line_count).map(OneIndexed::from_zero_indexed));
350-
}
351353

352-
NotebookIndex {
353-
row_to_cell,
354-
row_to_row_in_cell,
354+
current_row = current_row.saturating_add(line_count);
355355
}
356+
357+
NotebookIndex { cell_starts }
356358
}
357359

358360
/// Return the notebook content.
@@ -456,7 +458,7 @@ mod tests {
456458

457459
use ruff_source_file::OneIndexed;
458460

459-
use crate::{Cell, Notebook, NotebookError, NotebookIndex};
461+
use crate::{Cell, CellStart, Notebook, NotebookError, NotebookIndex};
460462

461463
/// Construct a path to a Jupyter notebook in the `resources/test/fixtures/jupyter` directory.
462464
fn notebook_path(path: impl AsRef<Path>) -> std::path::PathBuf {
@@ -548,39 +550,27 @@ print("after empty cells")
548550
assert_eq!(
549551
notebook.index(),
550552
&NotebookIndex {
551-
row_to_cell: vec![
552-
OneIndexed::from_zero_indexed(0),
553-
OneIndexed::from_zero_indexed(0),
554-
OneIndexed::from_zero_indexed(0),
555-
OneIndexed::from_zero_indexed(0),
556-
OneIndexed::from_zero_indexed(0),
557-
OneIndexed::from_zero_indexed(0),
558-
OneIndexed::from_zero_indexed(2),
559-
OneIndexed::from_zero_indexed(2),
560-
OneIndexed::from_zero_indexed(2),
561-
OneIndexed::from_zero_indexed(2),
562-
OneIndexed::from_zero_indexed(2),
563-
OneIndexed::from_zero_indexed(4),
564-
OneIndexed::from_zero_indexed(6),
565-
OneIndexed::from_zero_indexed(6),
566-
OneIndexed::from_zero_indexed(7)
567-
],
568-
row_to_row_in_cell: vec![
569-
OneIndexed::from_zero_indexed(0),
570-
OneIndexed::from_zero_indexed(1),
571-
OneIndexed::from_zero_indexed(2),
572-
OneIndexed::from_zero_indexed(3),
573-
OneIndexed::from_zero_indexed(4),
574-
OneIndexed::from_zero_indexed(5),
575-
OneIndexed::from_zero_indexed(0),
576-
OneIndexed::from_zero_indexed(1),
577-
OneIndexed::from_zero_indexed(2),
578-
OneIndexed::from_zero_indexed(3),
579-
OneIndexed::from_zero_indexed(4),
580-
OneIndexed::from_zero_indexed(0),
581-
OneIndexed::from_zero_indexed(0),
582-
OneIndexed::from_zero_indexed(1),
583-
OneIndexed::from_zero_indexed(0)
553+
cell_starts: vec![
554+
CellStart {
555+
start_row: OneIndexed::MIN,
556+
raw_cell_index: OneIndexed::MIN
557+
},
558+
CellStart {
559+
start_row: OneIndexed::from_zero_indexed(6),
560+
raw_cell_index: OneIndexed::from_zero_indexed(2)
561+
},
562+
CellStart {
563+
start_row: OneIndexed::from_zero_indexed(11),
564+
raw_cell_index: OneIndexed::from_zero_indexed(4)
565+
},
566+
CellStart {
567+
start_row: OneIndexed::from_zero_indexed(12),
568+
raw_cell_index: OneIndexed::from_zero_indexed(6)
569+
},
570+
CellStart {
571+
start_row: OneIndexed::from_zero_indexed(14),
572+
raw_cell_index: OneIndexed::from_zero_indexed(7)
573+
}
584574
],
585575
}
586576
);

0 commit comments

Comments
 (0)