Skip to content
Merged
Changes from 11 commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
c3cc4a1
refactor(parser): use integer parsing functions from stdlib
Alvaro-Kothe Oct 11, 2025
2459313
perf: use a local buffer to store the processed string
Alvaro-Kothe Oct 13, 2025
d8a454e
fix: use macro to fix MSVC build error
Alvaro-Kothe Oct 13, 2025
87789e6
fix: use `bool`
Alvaro-Kothe Oct 13, 2025
2287944
refactor: don't pass PROCESSED_WORD_CAPACITY as a separate argument
Alvaro-Kothe Oct 13, 2025
2bea3c2
perf: write in chunks
Alvaro-Kothe Oct 13, 2025
fb38679
hack: try bigger buffer size for arm error
Alvaro-Kothe Oct 13, 2025
f9ede5c
fix: solution without manipulating the string
Alvaro-Kothe Oct 13, 2025
798c263
some cleanup
Alvaro-Kothe Oct 13, 2025
2f06f19
fix: use ptrdiff_t to fix MSVC build error
Alvaro-Kothe Oct 13, 2025
280b55e
add other exponent cases for completion
Alvaro-Kothe Oct 14, 2025
d85aaf0
fix: use builtin overflow check verification
Alvaro-Kothe Oct 14, 2025
9046ecc
fix: change std to c2x
Alvaro-Kothe Oct 14, 2025
a4e2fb8
Revert previous commits
Alvaro-Kothe Oct 14, 2025
ef82cf4
refactor: move overflow check to header
Alvaro-Kothe Oct 14, 2025
5afeb11
refactor: use overflow check from numpy
Alvaro-Kothe Oct 14, 2025
0ef47a7
fix: handle negative check
Alvaro-Kothe Oct 14, 2025
c840ef0
fix: add test for thousand separator with negative number
Alvaro-Kothe Oct 14, 2025
132342b
move to portable
Alvaro-Kothe Oct 14, 2025
e6977cc
perf: use builtin unsigned long overflow check
Alvaro-Kothe Oct 14, 2025
8120eea
refactor: combine builting and gnuc branches
Alvaro-Kothe Oct 14, 2025
1f5d506
don't assign null
Alvaro-Kothe Oct 14, 2025
479a2ab
fix: perform bound check
Alvaro-Kothe Oct 14, 2025
c37c355
fix: assign error if doesn't have a digit after tsep
Alvaro-Kothe Oct 14, 2025
7d55283
fix: go back to buffer solution
Alvaro-Kothe Oct 14, 2025
ffe50ce
refactor: undo refactor in np_datetime.c
Alvaro-Kothe Oct 14, 2025
af5ad71
fix: fix undefined behavior
Alvaro-Kothe Oct 14, 2025
9211704
fix: fix leftover undefined behaviour
Alvaro-Kothe Oct 14, 2025
d026b01
rewrite `copy_string_without_char`
Alvaro-Kothe Oct 14, 2025
d76ff5f
fix: change solution to safe guard against end_ptr
Alvaro-Kothe Oct 14, 2025
b523a19
test: add some edge cases tests with thousand separator
Alvaro-Kothe Oct 14, 2025
6265172
Update pandas/_libs/src/parser/tokenizer.c
Alvaro-Kothe Oct 14, 2025
abed6c1
fix: error to -1
Alvaro-Kothe Oct 14, 2025
8616f9f
fix: leftover status check
Alvaro-Kothe Oct 14, 2025
c4e0e25
fix: remove duplicate nbytes declaration
Alvaro-Kothe Oct 14, 2025
0b19208
fix: use memchr to find if need to process the word
Alvaro-Kothe Oct 15, 2025
29d74f7
chore: add comment explaining why 128 bytes for capacity
Alvaro-Kothe Oct 15, 2025
b5b8f3b
rename bytes_read to bytes_written
Alvaro-Kothe Oct 15, 2025
803a8bf
chore: move errno assignment
Alvaro-Kothe Oct 15, 2025
31f26cf
fix: fix error logic by comparing pointers
Alvaro-Kothe Oct 15, 2025
171b553
fix: use pointers
Alvaro-Kothe Oct 15, 2025
30f6bdc
fix: keep track on how many bytes to read
Alvaro-Kothe Oct 15, 2025
554675b
chore: cast to size_t
Alvaro-Kothe Oct 15, 2025
7ea4454
fix: cast pointer to fix Wc++-compat warning
Alvaro-Kothe Oct 15, 2025
82dc037
fix: remove casts for -Weverything
Alvaro-Kothe Oct 15, 2025
627df4c
fix: move remaining_bytes_to_read to the start of loop
Alvaro-Kothe Oct 15, 2025
1b3eba0
fix: consolidate it even further
Alvaro-Kothe Oct 15, 2025
e1667fa
fix: move right definition
Alvaro-Kothe Oct 15, 2025
bee776a
chore: remove superfluous comments
Alvaro-Kothe Oct 15, 2025
d693345
fix: add const qualifier
Alvaro-Kothe Oct 15, 2025
593c614
fix: remove unnecessary NULL and null-byte checks
Alvaro-Kothe Oct 15, 2025
ff4d48b
fix: remove unnecessary errno verification
Alvaro-Kothe Oct 15, 2025
e3a88d3
chore: remove NULL assignment
Alvaro-Kothe Oct 15, 2025
b135738
fix: don't recompute strlen
Alvaro-Kothe Oct 15, 2025
ba8c9b3
chore: add some comments back to simplify diff
Alvaro-Kothe Oct 15, 2025
818921f
fix: reset errno after handling it
Alvaro-Kothe Oct 15, 2025
3e067f7
fix: put back const char p
Alvaro-Kothe Oct 15, 2025
c0ed83c
fix: improve diff for sign handling
Alvaro-Kothe Oct 15, 2025
cb60adb
fix: improve diff for trailing whitespace
Alvaro-Kothe Oct 15, 2025
5117e89
chore: remove newline to simplify diff even more
Alvaro-Kothe Oct 15, 2025
e1e327a
chore: drop another superfluous comment
Alvaro-Kothe Oct 15, 2025
ffcb7c2
test: xfail python engine with consecutive thousand separators
Alvaro-Kothe Oct 15, 2025
47b87f9
fix: move errno handling to avoid polution and early return
Alvaro-Kothe Oct 16, 2025
cd536fb
chore: rename to number for diff
Alvaro-Kothe Oct 16, 2025
1ef9259
chore: add comment explaining consecutive thousand separators in C en…
Alvaro-Kothe Oct 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
309 changes: 157 additions & 152 deletions pandas/_libs/src/parser/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@ GitHub. See Python Software Foundation License and BSD licenses for these.

#include <ctype.h>
#include <float.h>
#include <limits.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

#include "pandas/portable.h"
#include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
Expand Down Expand Up @@ -1834,201 +1837,203 @@ int uint64_conflict(uint_state *self) {
return self->seen_uint && (self->seen_sint || self->seen_null);
}

int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep) {
const char *p = p_item;
// Skip leading spaces.
while (isspace_ascii(*p)) {
++p;
/**
* @brief Check if the character in the pointer indicates a number.
* It expects that you consumed all leading whitespace.
*
* @param p_item Pointer to verify
* @return Non-zero integer indicating that has a digit 0 otherwise.
*/
static inline bool has_digit_int(const char *str) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Going back to my comment made (in another PR?) - if we replace the char ** words member with a custom struct, we could add to that struct a member that flags this as the word is built, rather than coming back after the fact.

Not something to tackle in this PR - just wanted to give you an idea

if (!str || *str == '\0') {
return false;
}

switch (*str) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return true;
case '+':
case '-':
return isdigit_ascii(str[1]);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will violate bounds checking if you pass a string of + or -

default:
return false;
}
}

// Handle sign.
const bool isneg = *p == '-' ? true : false;
// Handle sign.
if (isneg || (*p == '+')) {
p++;
static inline bool has_only_spaces(const char *str) {
while (*str != '\0' && isspace_ascii(*str)) {
str++;
}
return *str == '\0';
}

// Check that there is a first digit.
if (!isdigit_ascii(*p)) {
// Error...
*error = ERROR_NO_DIGITS;
static int power_int(int base, int exponent) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is int the right return type for this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mainly because I expect the result to be a small power of 10, up to 10**6

// https://en.wikipedia.org/wiki/Exponentiation_by_squaring
if (exponent == 0) {
return 1;
} else if (exponent < 0) {
return 0;
}

int64_t number = 0;
if (isneg) {
// If number is greater than pre_min, at least one more digit
// can be processed without overflowing.
int dig_pre_min = -(int_min % 10);
int64_t pre_min = int_min / 10;

// Process the digits.
char d = *p;
if (tsep != '\0') {
while (1) {
if (d == tsep) {
d = *++p;
continue;
} else if (!isdigit_ascii(d)) {
break;
}
if ((number > pre_min) ||
((number == pre_min) && (d - '0' <= dig_pre_min))) {
number = number * 10 - (d - '0');
d = *++p;
} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
} else {
while (isdigit_ascii(d)) {
if ((number > pre_min) ||
((number == pre_min) && (d - '0' <= dig_pre_min))) {
number = number * 10 - (d - '0');
d = *++p;
} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
int result = 1;

while (exponent > 1) {
if (exponent % 2 == 1) {
result *= base;
exponent--;
}
} else {
// If number is less than pre_max, at least one more digit
// can be processed without overflowing.
int64_t pre_max = int_max / 10;
int dig_pre_max = int_max % 10;

// Process the digits.
char d = *p;
if (tsep != '\0') {
while (1) {
if (d == tsep) {
d = *++p;
continue;
} else if (!isdigit_ascii(d)) {
break;
}
if ((number < pre_max) ||
((number == pre_max) && (d - '0' <= dig_pre_max))) {
number = number * 10 + (d - '0');
d = *++p;
result *= result;
exponent /= 2;
}

} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
} else {
while (isdigit_ascii(d)) {
if ((number < pre_max) ||
((number == pre_max) && (d - '0' <= dig_pre_max))) {
number = number * 10 + (d - '0');
d = *++p;
return result * base;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks like it could overflow easily

}

} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
static inline int64_t add_int_check_overflow(int64_t lhs, int64_t rhs,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's no need to implement custom overflow detection. We already have macros that defer to compiler builtins (see the np_datetime.c module). You can refactor those if needed to make available here.

FWIW C23 also standardizes checked overflow functions. I'm not sure how far along MSVC is in implementing that, but for other platforms we can likely use the standard on newer compilers.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pinned meson-python version doesn't allow compiling with std=c23. I don't know why stdckdint.h worked on my system with c11.

I will use the macro in np_datetime.c to check for overflow, but I will move it to the header.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am still using my implementation for uint overflow, because the NumPy implementation seems specific to int64 (at least for Windows).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pinned meson-python version doesn't allow compiling with std=c23. I don't know why stdckdint.h worked on my system with c11.

You can specify multiple standards with Meson, and it will pick the first that the compiler supports. So can just set the option of c_std=c23,c11

I don't know why stdckdint.h worked on my system with c11

While stdckdint.h may not have been standardized until after c11, it doesn't exclude your compiler from using it if it has already implemented.

I am still using my implementation for uint overflow, because the NumPy implementation seems specific to int64 (at least for Windows).

Its unfortunate the MSVC implementation doesn't use a generic macro like the gcc/clang versions do to make these work regardless of size. However, there's the ULongLongAdd function you can use instead of your own implementation:

https://learn.microsoft.com/en-us/windows/win32/api/intsafe/nf-intsafe-ulonglongadd

Generally opt for the compiler built-ins when available; they are almost assuredly faster, and mean less code we have to maintain :-)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can specify multiple standards with Meson, and it will pick the first that the compiler supports. So can just set the option of c_std=c23,c11

I got this error

../../meson.build:2:0: ERROR: Value "c11,c23" (of type "string") for combo option "C language standard to use" is not one of the choices. Possible choices are (as string): "none",
"c89", "c99", "c11", "c17", "c18", "c2x", "gnu89", "gnu99", "gnu11", "gnu17", "gnu18", "gnu2x".

It needs meson>=1.4

int64_t mul_lhs) {
// rhs will always be positive, because this function
// only executes after the first parse, hence the sign will always go to lhs.
// if lhs > 0:
// Will overflow if (mul_lhs * lhs) + rhs > INT_MAX
// iff lhs > (INT_MAX - rhs) / mul_lhs
// if lhs < 0:
// Will underflow if (mul_lhs * lhs) - rhs < INT_MIN
// iff lhs < (INT_MIN + rhs) / mul_lhs
if (lhs >= 0) {
if (lhs > (INT_MAX - rhs) / mul_lhs) {
errno = ERANGE;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Related to my previous comment about return codes, I think in user code its better to return a code when you can rather than setting the global errno. errno is a static global, so its not thread safe, and you can be mangling codes by doing this. I think its also more of an artifact to the language than a modern error messaging solution

}
} else {
if (lhs < (INT_MIN + rhs) / mul_lhs) {
errno = ERANGE;
}
rhs = -rhs;
}
return lhs * mul_lhs + rhs;
}

// Skip trailing spaces.
while (isspace_ascii(*p)) {
++p;
static inline uint64_t add_uint_check_overflow(uint64_t lhs, uint64_t rhs,
uint64_t mul_lhs) {
if (lhs > (UINT_MAX - rhs) / mul_lhs) {
errno = ERANGE;
}
return lhs * mul_lhs + rhs;
}

// Did we use up all the characters?
if (*p) {
*error = ERROR_INVALID_CHARS;
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep) {
if (!p_item || *p_item == '\0') {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it necessary to add this? I don't think we should be special-casing behavior for null or the null byte; I expect the former is undefined (since its not a string) and the latter would be handled naturally by the rest of the logic (?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, not necessary. The null byte is already handled below.

*error = ERROR_NO_DIGITS;
return 0;
}

*error = 0;
return number;
while (isspace_ascii(*p_item)) {
++p_item;
}

if (!has_digit_int(p_item)) {
*error = ERROR_NO_DIGITS;
return 0;
}

errno = 0;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need to check that the errno isn't set already before just clearing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem necessary, this is just parsing a new number and error handling is done in parsers.pyx

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can remove this now, right? Generally I'd advise against clearing errno without good reason. This currently is equivalent to doing a try ... except Exception in Python and blindly continuing along, so it should be avoided

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The errno should be reset. I added a comment explaining why. Mainly because strtoll assign its value and I want to reset before calling it, so that parsing previous words don't pollute the verification below.

char *endptr = NULL;
int64_t result = strtoll(p_item, &endptr, 10);

while (errno == 0 && tsep != '\0' && *endptr == tsep) {
// Skip multiple consecutive tsep
while (*endptr == tsep) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I am misreading but its strange to see this in the invariant here and directly preceding it. Can we consolidate that logic somehow?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another option is to move endptr to the next character and make sure it's a digit, if it's not, exit the loop.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That will probably read easier

endptr++;
}

char *new_end = NULL;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to assign NULL

int64_t next_part = strtoll(endptr, &new_end, 10);
ptrdiff_t digits = new_end - endptr;
int64_t mul_result = power_int(10, (int)digits);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah OK - I think you are assembling this in pieces. However, I'd still suggest just getting rid of the separators in a local buffer and then just calling strtoll (or whatever function) on the whole buffer. There's a lot of nuance to range/value handling that its not worth reimplementing

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem with the local buffer is that there were failing tests on macOS that I couldn't reproduce. The malloc solution worked, but the local buffer on the stack was failing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds like you were hitting some undefined behavior, but that would come from whatever the implementation was, not the general solution. We should definitely go back to that route and just see where we got stuck

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. I will address some of your comments here, commit, and then revert to that state.

result = add_int_check_overflow(result, next_part, mul_result);
endptr = new_end;
}

if (!has_only_spaces(endptr)) {
// Check first for invalid characters because we may
// want to skip integer parsing if we find one.
*error = ERROR_INVALID_CHARS;
result = 0;
} else if (errno == ERANGE || result > int_max || result < int_min) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suppose this is unrelated to your PR but why does this function accept int_max and int_min as arguments? Assuming those are actually set to INT64_MAX and INT64_MIN this is a no-op, and potentially waste of cycles if the compiler can't optimize it away

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, it's unnecessary. It's set to INT64_MAX and INT64_MIN.

data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
&error, parser.thousands)

Should I just check errno or I should also change the function signature?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing the function signature is fine (let's do as a separate PR)

*error = ERROR_OVERFLOW;
result = 0;
} else {
*error = 0;
}

return result;
}

uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep) {
const char *p = p_item;
// Skip leading spaces.
while (isspace_ascii(*p)) {
++p;
if (!p_item || *p_item == '\0') {
*error = ERROR_NO_DIGITS;
return 0;
}

// Handle sign.
if (*p == '-') {
while (isspace_ascii(*p_item)) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you removed the const char *p = p_item assignment as a cleanup and not for any type of functionality, but if that's true its inflating the diff. Should move cleanup items like that to a separate PR

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've made several changes that complicate a lot the diff. I'll put back the const char *p assignment, and also remove the need for some auxiliary functions created.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great - thanks!

++p_item;
}

if (*p_item == '-') {
state->seen_sint = 1;
*error = 0;
return 0;
} else if (*p == '+') {
p++;
} else if (*p_item == '+') {
p_item++;
}

// Check that there is a first digit.
if (!isdigit_ascii(*p)) {
// Error...
if (!isdigit_ascii(*p_item)) {
*error = ERROR_NO_DIGITS;
return 0;
}

// If number is less than pre_max, at least one more digit
// can be processed without overflowing.
//
// Process the digits.
uint64_t number = 0;
const uint64_t pre_max = uint_max / 10;
const uint64_t dig_pre_max = uint_max % 10;
char d = *p;
if (tsep != '\0') {
while (1) {
if (d == tsep) {
d = *++p;
continue;
} else if (!isdigit_ascii(d)) {
break;
}
if ((number < pre_max) ||
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
number = number * 10 + (d - '0');
d = *++p;

} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
} else {
while (isdigit_ascii(d)) {
if ((number < pre_max) ||
((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
number = number * 10 + (d - '0');
d = *++p;
errno = 0;
char *endptr = NULL;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't assign NULL

uint64_t result = strtoull(p_item, &endptr, 10);

} else {
*error = ERROR_OVERFLOW;
return 0;
}
while (errno == 0 && tsep != '\0' && *endptr == tsep) {
// Skip multiple consecutive tsep
while (*endptr == tsep) {
endptr++;
}
}

// Skip trailing spaces.
while (isspace_ascii(*p)) {
++p;
char *new_end = NULL;
uint64_t next_part = strtoull(endptr, &new_end, 10);
ptrdiff_t digits = new_end - endptr;
uint64_t mul_result = power_int(10, (int)digits);
result = add_uint_check_overflow(result, next_part, mul_result);
endptr = new_end;
}

// Did we use up all the characters?
if (*p) {
if (!has_only_spaces(endptr)) {
*error = ERROR_INVALID_CHARS;
return 0;
result = 0;
} else if (errno == ERANGE || result > uint_max) {
*error = ERROR_OVERFLOW;
result = 0;
} else {
*error = 0;
}

if (number > (uint64_t)int_max) {
if (result > (uint64_t)int_max) {
state->seen_uint = 1;
}

*error = 0;
return number;
return result;
}
Loading