Skip to content

Commit fcd67f2

Browse files
committed
Reduce memory footprint of notebooks
1 parent 6185a2a commit fcd67f2

File tree

3 files changed

+84
-71
lines changed

3 files changed

+84
-71
lines changed

crates/ruff_db/src/diagnostic/render/full.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -112,16 +112,16 @@ impl std::fmt::Display for Diff<'_> {
112112
// `None`, indicating a regular script file, all the lines will be in one "cell" under the
113113
// `None` key.
114114
let cells = if let Some(notebook_index) = &self.notebook_index {
115-
let mut last_cell = OneIndexed::MIN;
115+
let mut last_cell_index = OneIndexed::MIN;
116116
let mut cells: Vec<(Option<OneIndexed>, TextSize)> = Vec::new();
117-
for (row, cell) in notebook_index.iter() {
118-
if cell != last_cell {
119-
let offset = source_code.line_start(row);
120-
cells.push((Some(last_cell), offset));
121-
last_cell = cell;
117+
for cell in notebook_index.iter() {
118+
if cell.cell_index() != last_cell_index {
119+
let offset = source_code.line_start(cell.start_row());
120+
cells.push((Some(last_cell_index), offset));
121+
last_cell_index = cell.cell_index();
122122
}
123123
}
124-
cells.push((Some(last_cell), source_text.text_len()));
124+
cells.push((Some(last_cell_index), source_text.text_len()));
125125
cells
126126
} else {
127127
vec![(None, source_text.text_len())]

crates/ruff_notebook/src/index.rs

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8,37 +8,40 @@ use ruff_source_file::{LineColumn, OneIndexed, SourceLocation};
88
/// [`ruff_text_size::TextSize`] to jupyter notebook cell/row/column.
99
#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
1010
pub struct NotebookIndex {
11-
/// Enter a row (1-based), get back the cell (1-based)
12-
pub(super) row_to_cell: Vec<OneIndexed>,
13-
/// Enter a row (1-based), get back the row in cell (1-based)
14-
pub(super) row_to_row_in_cell: Vec<OneIndexed>,
11+
/// Stores the starting row and the absolute cell index for every Python (valid) cell.
12+
///
13+
/// The index in this vector corresponds to the Python cell index (valid cell index).
14+
pub(super) cell_starts: Vec<CellStart>,
1515
}
1616

1717
impl NotebookIndex {
18-
pub fn new(row_to_cell: Vec<OneIndexed>, row_to_row_in_cell: Vec<OneIndexed>) -> Self {
19-
Self {
20-
row_to_cell,
21-
row_to_row_in_cell,
18+
fn find_cell(&self, row: OneIndexed) -> Option<CellStart> {
19+
match self
20+
.cell_starts
21+
.binary_search_by_key(&row, |start| start.start_row)
22+
{
23+
Ok(cell_index) => Some(self.cell_starts[cell_index]),
24+
Err(insertion_point) => Some(self.cell_starts[insertion_point.checked_sub(1)?]),
2225
}
2326
}
2427

25-
/// Returns the cell number (1-based) for the given row (1-based).
28+
/// Returns the (raw) cell number (1-based) for the given row (1-based).
2629
pub fn cell(&self, row: OneIndexed) -> Option<OneIndexed> {
27-
self.row_to_cell.get(row.to_zero_indexed()).copied()
30+
self.find_cell(row).map(|start| start.raw_cell_index)
2831
}
2932

3033
/// Returns the row number (1-based) in the cell (1-based) for the
3134
/// given row (1-based).
3235
pub fn cell_row(&self, row: OneIndexed) -> Option<OneIndexed> {
33-
self.row_to_row_in_cell.get(row.to_zero_indexed()).copied()
36+
self.find_cell(row)
37+
.map(|start| OneIndexed::from_zero_indexed(row.get() - start.start_row.get()))
3438
}
3539

36-
/// Returns an iterator over the row:cell-number pairs (both 1-based).
37-
pub fn iter(&self) -> impl Iterator<Item = (OneIndexed, OneIndexed)> {
38-
self.row_to_cell
39-
.iter()
40-
.enumerate()
41-
.map(|(row, cell)| (OneIndexed::from_zero_indexed(row), *cell))
40+
/// Returns an iterator over the [`CellStart`] of each cell: its starting row and raw cell index (both 1-based).
41+
///
42+
/// This yields one entry per Python cell (skipping over Markdown cells).
43+
pub fn iter(&self) -> impl Iterator<Item = CellStart> + '_ {
44+
self.cell_starts.iter().copied()
4245
}
4346

4447
/// Translates the given [`LineColumn`] based on the indexing table.
@@ -67,3 +70,23 @@ impl NotebookIndex {
6770
}
6871
}
6972
}
73+
74+
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)]
75+
pub struct CellStart {
76+
/// The row in the concatenated notebook source code at which
77+
/// this cell starts.
78+
pub(super) start_row: OneIndexed,
79+
80+
/// The absolute index of this cell in the notebook.
81+
pub(super) raw_cell_index: OneIndexed,
82+
}
83+
84+
impl CellStart {
85+
pub fn start_row(&self) -> OneIndexed {
86+
self.start_row
87+
}
88+
89+
pub fn cell_index(&self) -> OneIndexed {
90+
self.raw_cell_index
91+
}
92+
}

crates/ruff_notebook/src/notebook.rs

Lines changed: 37 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use ruff_text_size::TextSize;
1818
use crate::cell::CellOffsets;
1919
use crate::index::NotebookIndex;
2020
use crate::schema::{Cell, RawNotebook, SortAlphabetically, SourceValue};
21-
use crate::{CellMetadata, RawNotebookMetadata, schema};
21+
use crate::{CellMetadata, CellStart, RawNotebookMetadata, schema};
2222

2323
/// Run round-trip source code generation on a given Jupyter notebook file path.
2424
pub fn round_trip(path: &Path) -> anyhow::Result<String> {
@@ -320,11 +320,19 @@ impl Notebook {
320320
/// The index building is expensive as it needs to go through the content of
321321
/// every valid code cell.
322322
fn build_index(&self) -> NotebookIndex {
323-
let mut row_to_cell = Vec::new();
324-
let mut row_to_row_in_cell = Vec::new();
323+
let mut cell_starts = Vec::with_capacity(self.valid_code_cells.len());
324+
325+
let mut current_row = OneIndexed::MIN;
325326

326327
for &cell_index in &self.valid_code_cells {
327-
let line_count = match &self.raw.cells[cell_index as usize].source() {
328+
let raw_cell_index = cell_index as usize;
329+
// Record the starting row of this cell
330+
cell_starts.push(CellStart {
331+
start_row: current_row,
332+
raw_cell_index: OneIndexed::from_zero_indexed(raw_cell_index),
333+
});
334+
335+
let line_count = match &self.raw.cells[raw_cell_index].source() {
328336
SourceValue::String(string) => {
329337
if string.is_empty() {
330338
1
@@ -342,17 +350,11 @@ impl Notebook {
342350
}
343351
}
344352
};
345-
row_to_cell.extend(std::iter::repeat_n(
346-
OneIndexed::from_zero_indexed(cell_index as usize),
347-
line_count,
348-
));
349-
row_to_row_in_cell.extend((0..line_count).map(OneIndexed::from_zero_indexed));
350-
}
351353

352-
NotebookIndex {
353-
row_to_cell,
354-
row_to_row_in_cell,
354+
current_row = current_row.saturating_add(line_count);
355355
}
356+
357+
NotebookIndex { cell_starts }
356358
}
357359

358360
/// Return the notebook content.
@@ -456,7 +458,7 @@ mod tests {
456458

457459
use ruff_source_file::OneIndexed;
458460

459-
use crate::{Cell, Notebook, NotebookError, NotebookIndex};
461+
use crate::{Cell, CellStart, Notebook, NotebookError, NotebookIndex};
460462

461463
/// Construct a path to a Jupyter notebook in the `resources/test/fixtures/jupyter` directory.
462464
fn notebook_path(path: impl AsRef<Path>) -> std::path::PathBuf {
@@ -548,39 +550,27 @@ print("after empty cells")
548550
assert_eq!(
549551
notebook.index(),
550552
&NotebookIndex {
551-
row_to_cell: vec![
552-
OneIndexed::from_zero_indexed(0),
553-
OneIndexed::from_zero_indexed(0),
554-
OneIndexed::from_zero_indexed(0),
555-
OneIndexed::from_zero_indexed(0),
556-
OneIndexed::from_zero_indexed(0),
557-
OneIndexed::from_zero_indexed(0),
558-
OneIndexed::from_zero_indexed(2),
559-
OneIndexed::from_zero_indexed(2),
560-
OneIndexed::from_zero_indexed(2),
561-
OneIndexed::from_zero_indexed(2),
562-
OneIndexed::from_zero_indexed(2),
563-
OneIndexed::from_zero_indexed(4),
564-
OneIndexed::from_zero_indexed(6),
565-
OneIndexed::from_zero_indexed(6),
566-
OneIndexed::from_zero_indexed(7)
567-
],
568-
row_to_row_in_cell: vec![
569-
OneIndexed::from_zero_indexed(0),
570-
OneIndexed::from_zero_indexed(1),
571-
OneIndexed::from_zero_indexed(2),
572-
OneIndexed::from_zero_indexed(3),
573-
OneIndexed::from_zero_indexed(4),
574-
OneIndexed::from_zero_indexed(5),
575-
OneIndexed::from_zero_indexed(0),
576-
OneIndexed::from_zero_indexed(1),
577-
OneIndexed::from_zero_indexed(2),
578-
OneIndexed::from_zero_indexed(3),
579-
OneIndexed::from_zero_indexed(4),
580-
OneIndexed::from_zero_indexed(0),
581-
OneIndexed::from_zero_indexed(0),
582-
OneIndexed::from_zero_indexed(1),
583-
OneIndexed::from_zero_indexed(0)
553+
cell_starts: vec![
554+
CellStart {
555+
start_row: OneIndexed::MIN,
556+
raw_cell_index: OneIndexed::MIN
557+
},
558+
CellStart {
559+
start_row: OneIndexed::from_zero_indexed(6),
560+
raw_cell_index: OneIndexed::from_zero_indexed(2)
561+
},
562+
CellStart {
563+
start_row: OneIndexed::from_zero_indexed(11),
564+
raw_cell_index: OneIndexed::from_zero_indexed(4)
565+
},
566+
CellStart {
567+
start_row: OneIndexed::from_zero_indexed(12),
568+
raw_cell_index: OneIndexed::from_zero_indexed(6)
569+
},
570+
CellStart {
571+
start_row: OneIndexed::from_zero_indexed(14),
572+
raw_cell_index: OneIndexed::from_zero_indexed(7)
573+
}
584574
],
585575
}
586576
);

0 commit comments

Comments
 (0)