-
-
Notifications
You must be signed in to change notification settings - Fork 19.3k
CLN: use integer parsing functions from stdlib #62658
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
c3cc4a1
2459313
d8a454e
87789e6
2287944
2bea3c2
fb38679
f9ede5c
798c263
2f06f19
280b55e
d85aaf0
9046ecc
a4e2fb8
ef82cf4
5afeb11
0ef47a7
c840ef0
132342b
e6977cc
8120eea
1f5d506
479a2ab
c37c355
7d55283
ffe50ce
af5ad71
9211704
d026b01
d76ff5f
b523a19
6265172
abed6c1
8616f9f
c4e0e25
0b19208
29d74f7
b5b8f3b
803a8bf
31f26cf
171b553
30f6bdc
554675b
7ea4454
82dc037
627df4c
1b3eba0
e1667fa
bee776a
d693345
593c614
ff4d48b
e3a88d3
b135738
ba8c9b3
818921f
3e067f7
c0ed83c
cb60adb
5117e89
e1e327a
ffcb7c2
47b87f9
cd536fb
1ef9259
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -23,10 +23,13 @@ GitHub. See Python Software Foundation License and BSD licenses for these. | |||||
| #include <float.h> | ||||||
| #include <math.h> | ||||||
| #include <stdbool.h> | ||||||
| #include <stdlib.h> | ||||||
|
|
||||||
| #include "pandas/portable.h" | ||||||
| #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64 | ||||||
|
|
||||||
| static const int PROCESSED_WORD_CAPACITY = 128; | ||||||
|
|
||||||
| void coliter_setup(coliter_t *self, parser_t *parser, int64_t i, | ||||||
| int64_t start) { | ||||||
| // column i, starting at 0 | ||||||
|
|
@@ -1834,201 +1837,163 @@ int uint64_conflict(uint_state *self) { | |||||
| return self->seen_uint && (self->seen_sint || self->seen_null); | ||||||
| } | ||||||
|
|
||||||
| int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, | ||||||
| int *error, char tsep) { | ||||||
| const char *p = p_item; | ||||||
| // Skip leading spaces. | ||||||
| while (isspace_ascii(*p)) { | ||||||
| ++p; | ||||||
| /** | ||||||
| * @brief Check if the character in the pointer indicates a number. | ||||||
| * It expects that you consumed all leading whitespace. | ||||||
| * | ||||||
| * @param p_item Pointer to verify | ||||||
| * @return Non-zero integer indicating that has a digit 0 otherwise. | ||||||
| */ | ||||||
| static inline int has_digit_int(const char *str) { | ||||||
| if (!str || *str == '\0') { | ||||||
| return 0; | ||||||
| } | ||||||
|
|
||||||
| // Handle sign. | ||||||
| const bool isneg = *p == '-' ? true : false; | ||||||
| // Handle sign. | ||||||
| if (isneg || (*p == '+')) { | ||||||
| p++; | ||||||
| switch (*str) { | ||||||
| case '0': | ||||||
| case '1': | ||||||
| case '2': | ||||||
| case '3': | ||||||
| case '4': | ||||||
| case '5': | ||||||
| case '6': | ||||||
| case '7': | ||||||
| case '8': | ||||||
| case '9': | ||||||
| return 1; | ||||||
| case '+': | ||||||
| case '-': | ||||||
| return isdigit_ascii(str[1]); | ||||||
|
||||||
| default: | ||||||
| return 0; | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| // Check that there is a first digit. | ||||||
| if (!isdigit_ascii(*p)) { | ||||||
| // Error... | ||||||
| *error = ERROR_NO_DIGITS; | ||||||
| return 0; | ||||||
| static inline int has_only_spaces(const char *str) { | ||||||
| while (*str != '\0' && isspace_ascii(*str)) { | ||||||
| str++; | ||||||
| } | ||||||
| return *str == '\0'; | ||||||
| } | ||||||
|
|
||||||
| int64_t number = 0; | ||||||
| if (isneg) { | ||||||
| // If number is greater than pre_min, at least one more digit | ||||||
| // can be processed without overflowing. | ||||||
| int dig_pre_min = -(int_min % 10); | ||||||
| int64_t pre_min = int_min / 10; | ||||||
|
|
||||||
| // Process the digits. | ||||||
| char d = *p; | ||||||
| if (tsep != '\0') { | ||||||
| while (1) { | ||||||
| if (d == tsep) { | ||||||
| d = *++p; | ||||||
| continue; | ||||||
| } else if (!isdigit_ascii(d)) { | ||||||
| break; | ||||||
| } | ||||||
| if ((number > pre_min) || | ||||||
| ((number == pre_min) && (d - '0' <= dig_pre_min))) { | ||||||
| number = number * 10 - (d - '0'); | ||||||
| d = *++p; | ||||||
| } else { | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| return 0; | ||||||
| } | ||||||
| } | ||||||
| } else { | ||||||
| while (isdigit_ascii(d)) { | ||||||
| if ((number > pre_min) || | ||||||
| ((number == pre_min) && (d - '0' <= dig_pre_min))) { | ||||||
| number = number * 10 - (d - '0'); | ||||||
| d = *++p; | ||||||
| } else { | ||||||
| *error = ERROR_OVERFLOW; | ||||||
| return 0; | ||||||
| } | ||||||
| } | ||||||
| /* Copy a string without `char_to_remove` into `output`, | ||||||
| * while ensuring it's null terminated. | ||||||
| */ | ||||||
| static void copy_string_without_char(char *output, const char *str, | ||||||
|
||||||
| static void copy_string_without_char(char *output, const char *str, | |
| static void copy_string_without_char(char output[PROCESSED_WORD_CAPACITY], const char *str, |
Don't think you need to pass this capacity as a separate argument, since it is always the same value
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done in 2287944
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be great to avoid character-by-character writes if we can, especially since this is a relatively sparse character to search for.
You might be limited with what you can do without some of the larger changes I suggested, but maybe even just finding a region and writing multiple bytes at once will be more performant.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Perhaps something like:
size_t pos = 0;
const char* oldptr = p_item;
while (const char* newptr = strchr(oldptr, tsep) != NULL) {
size_t len = newptr - oldptr;
memcpy(output[pos], oldptr, len);
oldptr = newptr + 1;
pos += len;
}
output[startpos + 1] = '\0';Might be an off by one and probably worth bounds checking. You might also want to use strchrnul instead of strchr to avoid a buffer overrun
Just some brief unchecked thoughts - hope they help
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I couldn't make strchr work, because I couldn't know where the null terminator would be. I tried your suggestion in 2bea3c2.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is pretty wasteful to continually write null bytes. You could either memset the buffer before you send it, or ideally short circuit when you've processed all the necessary bytes
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
continually write null bytes
Actually, it wasn't continually writing null bytes. Just once after copying src. Anyway, I changed to use memset.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it necessary to add this? I don't think we should be special-casing behavior for null or the null byte; I expect the former is undefined (since its not a string) and the latter would be handled naturally by the rest of the logic (?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, not necessary. The null byte is already handled below.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we need to check that the errno isn't set already before just clearing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't seem necessary, this is just parsing a new number and error handling is done in parsers.pyx
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you can remove this now, right? Generally I'd advise against clearing errno without good reason. This currently is equivalent to doing a try ... except Exception in Python and blindly continuing along, so it should be avoided
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The errno should be reset. I added a comment explaining why. Mainly because strtoll assign its value and I want to reset before calling it, so that parsing previous words don't pollute the verification below.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you use memchr here for consistency?
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is lost in a refactor right? I don't think errno will be set by anything preceding this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, I forgot to remove it.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| char *endptr = NULL; | |
| char *endptr; |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I suppose this is unrelated to your PR but why does this function accept int_max and int_min as arguments? Assuming those are actually set to INT64_MAX and INT64_MIN this is a no-op, and potentially waste of cycles if the compiler can't optimize it away
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, it's unnecessary. It's set to INT64_MAX and INT64_MIN.
pandas/pandas/_libs/parsers.pyx
Lines 1930 to 1931 in 066a4f7
| data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, | |
| &error, parser.thousands) |
Should I just check errno or I should also change the function signature?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Changing the function signature is fine (let's do as a separate PR)
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you removed the const char *p = p_item assignment as a cleanup and not for any type of functionality, but if that's true its inflating the diff. Should move cleanup items like that to a separate PR
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've made several changes that complicate a lot the diff. I'll put back the const char *p assignment, and also remove the need for some auxiliary functions created.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great - thanks!
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is undefined behavior, at least according to https://stackoverflow.com/a/79330647/621736. You may need to move the buffer declaration outside of this branch (can run ASAN/UBSAN to check)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It fixes it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are far enough removed from C89 that we can use the bool type :-)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done in 87789e6