pandas-dev · mroeschke · Oct 16, 2025 · Oct 11, 2025 · Oct 13, 2025 · Oct 13, 2025
diff --git a/pandas/_libs/include/pandas/portable.h b/pandas/_libs/include/pandas/portable.h
@@ -35,3 +35,38 @@ The full license is in the LICENSE file, distributed with this software.
   do {                                                                         \
   } while (0) /* fallthrough */
 #endif
+
+// Overflow-checked 64-bit arithmetic: checked_{int64,uint64}_{add,sub,mul}
+// store a (op) b in *res and evaluate to non-zero when the result overflowed.
+#if defined(_WIN32)
+#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
+#define ENABLE_INTSAFE_SIGNED_FUNCTIONS
+#endif
+#include <intsafe.h>
+#define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
+#define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
+#define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
+#define checked_uint64_add(a, b, res) ULongLongAdd(a, b, res)
+#define checked_uint64_sub(a, b, res) ULongLongSub(a, b, res)
+#define checked_uint64_mul(a, b, res) ULongLongMult(a, b, res)
+#else
+#if defined __has_builtin
+#if !__has_builtin(__builtin_add_overflow)
+_Static_assert(0,
+               "Overflow checking not detected; please try a newer compiler");
+#endif
+// __has_builtin was added in gcc 10, but our muslinux_1_1 build environment
+// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that
+#elif !(__GNUC__ > 7)
+_Static_assert(0, "__has_builtin not detected; please try a newer compiler");
+#endif
+// Defined once for every non-Windows compiler that passed the checks above,
+// so the signed and unsigned families cannot drift apart between branches.
+// The builtins are type-generic, hence the identical expansions.
+#define checked_int64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_int64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_int64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#define checked_uint64_add(a, b, res) __builtin_add_overflow(a, b, res)
+#define checked_uint64_sub(a, b, res) __builtin_sub_overflow(a, b, res)
+#define checked_uint64_mul(a, b, res) __builtin_mul_overflow(a, b, res)
+#endif
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -21,8 +21,11 @@ GitHub. See Python Software Foundation License and BSD licenses for these.
 
 #include <ctype.h>
 #include <float.h>
+#include <limits.h>
 #include <math.h>
 #include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
 
 #include "pandas/portable.h"
 #include "pandas/vendored/klib/khash.h" // for kh_int64_t, kh_destroy_int64
@@ -1834,201 +1837,197 @@ int uint64_conflict(uint_state *self) {
   return self->seen_uint && (self->seen_sint || self->seen_null);
 }
 
+/**
+ * @brief Report whether a string begins with an integer literal.
+ *
+ * An integer literal here is a decimal digit, or a '+' / '-' sign that is
+ * immediately followed by a decimal digit. Leading whitespace is not
+ * skipped, so callers are expected to have consumed it beforehand.
+ *
+ * @param str String to inspect; NULL is accepted.
+ * @return true when such a prefix is present; false otherwise, including
+ * for NULL and empty strings.
+ */
+static inline bool has_digit_int(const char *str) {
+  if (str == NULL) {
+    return false;
+  }
+
+  const char lead = *str;
+  const bool is_sign = lead == '+' || lead == '-';
+  if (is_sign) {
+    // A lone sign is not a number: a digit has to follow it directly.
+    return isdigit_ascii(str[1]);
+  }
+
+  // Anything else must itself be a digit. This also rejects the empty
+  // string, whose first character is the NUL terminator.
+  if (lead < '0' || lead > '9') {
+    return false;
+  }
+
+  return true;
+}
+
+static inline bool has_only_spaces(const char *str) {
+  // Step over ASCII whitespace; true iff nothing but the terminator is left.
+  for (; *str != '\0' && isspace_ascii(*str); ++str) {
+  }
+  return !*str;
+}
+
+static int power_int(int base, int exponent) {
+  // Integer base**exponent via exponentiation by squaring; a negative
+  // exponent yields 0. No overflow check: the result must fit in an int.
+  if (exponent == 0) {
+    return 1;
+  } else if (exponent < 0) {
+    return 0;
+  }
+  int result = 1;
+  while (exponent > 1) {
+    if (exponent % 2 == 1) {
+      result *= base;
+      exponent--;
+    }
+    // Square the base (not the accumulator) before halving the exponent.
+    base *= base;
+    exponent /= 2;
+  }
+  // https://en.wikipedia.org/wiki/Exponentiation_by_squaring
+  return result * base;
+}
+
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
-  const char *p = p_item;
-  // Skip leading spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
+  if (!p_item || *p_item == '\0') {
+    *error = ERROR_NO_DIGITS;
+    return 0;
   }
 
-  // Handle sign.
-  const bool isneg = *p == '-' ? true : false;
-  // Handle sign.
-  if (isneg || (*p == '+')) {
-    p++;
+  while (isspace_ascii(*p_item)) {
+    ++p_item;
   }
 
-  // Check that there is a first digit.
-  if (!isdigit_ascii(*p)) {
-    // Error...
+  if (!has_digit_int(p_item)) {
     *error = ERROR_NO_DIGITS;
     return 0;
   }
 
-  int64_t number = 0;
-  if (isneg) {
-    // If number is greater than pre_min, at least one more digit
-    // can be processed without overflowing.
-    int dig_pre_min = -(int_min % 10);
-    int64_t pre_min = int_min / 10;
-
-    // Process the digits.
-    char d = *p;
-    if (tsep != '\0') {
-      while (1) {
-        if (d == tsep) {
-          d = *++p;
-          continue;
-        } else if (!isdigit_ascii(d)) {
-          break;
-        }
-        if ((number > pre_min) ||
-            ((number == pre_min) && (d - '0' <= dig_pre_min))) {
-          number = number * 10 - (d - '0');
-          d = *++p;
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
-    } else {
-      while (isdigit_ascii(d)) {
-        if ((number > pre_min) ||
-            ((number == pre_min) && (d - '0' <= dig_pre_min))) {
-          number = number * 10 - (d - '0');
-          d = *++p;
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
-    }
-  } else {
-    // If number is less than pre_max, at least one more digit
-    // can be processed without overflowing.
-    int64_t pre_max = int_max / 10;
-    int dig_pre_max = int_max % 10;
-
-    // Process the digits.
-    char d = *p;
-    if (tsep != '\0') {
-      while (1) {
-        if (d == tsep) {
-          d = *++p;
-          continue;
-        } else if (!isdigit_ascii(d)) {
-          break;
-        }
-        if ((number < pre_max) ||
-            ((number == pre_max) && (d - '0' <= dig_pre_max))) {
-          number = number * 10 + (d - '0');
-          d = *++p;
+  errno = 0;
+  char *endptr = NULL;
+  int64_t result = strtoll(p_item, &endptr, 10);
+  bool is_negative = *p_item == '-'; // result < 0 misses inputs like "-0,5"
 
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
-    } else {
-      while (isdigit_ascii(d)) {
-        if ((number < pre_max) ||
-            ((number == pre_max) && (d - '0' <= dig_pre_max))) {
-          number = number * 10 + (d - '0');
-          d = *++p;
+  while (errno == 0 && tsep != '\0' && *endptr == tsep) {
+    while (*endptr == tsep) {
+      endptr++;
+    }
+    if (!isdigit_ascii(*endptr)) {
+      break; // only bare digits may follow a separator (no sign or space)
+    }
 
-        } else {
-          *error = ERROR_OVERFLOW;
-          return 0;
-        }
-      }
+    char *new_end = NULL;
+    int64_t next_part = strtoll(endptr, &new_end, 10);
+    if (is_negative) {
+      next_part = -next_part;
     }
-  }
 
-  // Skip trailing spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
+    // Scale by 10^digits in chunks of <= 9 digits so power_int stays in int.
+    for (ptrdiff_t left = new_end - endptr; left > 0; left -= 9) {
+      int64_t scale = power_int(10, left > 9 ? 9 : (int)left);
+      if (checked_int64_mul(result, scale, &result)) {
+        errno = ERANGE;
+      }
+    }
+    if (checked_int64_add(result, next_part, &result)) {
+      errno = ERANGE;
+    }
+    endptr = new_end;
   }
 
-  // Did we use up all the characters?
-  if (*p) {
+  if (!has_only_spaces(endptr)) {
+    // Check first for invalid characters because we may
+    // want to skip integer parsing if we find one.
     *error = ERROR_INVALID_CHARS;
-    return 0;
+    result = 0;
+  } else if (errno == ERANGE || result > int_max || result < int_min) {
+    *error = ERROR_OVERFLOW;
+    result = 0;
+  } else {
+    *error = 0;
   }
 
-  *error = 0;
-  return number;
+  return result;
 }
 
 uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
                        uint64_t uint_max, int *error, char tsep) {
-  const char *p = p_item;
-  // Skip leading spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
+  if (!p_item || *p_item == '\0') {
+    *error = ERROR_NO_DIGITS;
+    return 0;
   }
 
-  // Handle sign.
-  if (*p == '-') {
+  while (isspace_ascii(*p_item)) {
+    ++p_item;
+  }
+
+  if (*p_item == '-') {
     state->seen_sint = 1;
     *error = 0;
     return 0;
-  } else if (*p == '+') {
-    p++;
+  } else if (*p_item == '+') {
+    p_item++;
   }
 
   // Check that there is a first digit.
-  if (!isdigit_ascii(*p)) {
-    // Error...
+  if (!isdigit_ascii(*p_item)) {
     *error = ERROR_NO_DIGITS;
     return 0;
   }
 
-  // If number is less than pre_max, at least one more digit
-  // can be processed without overflowing.
-  //
-  // Process the digits.
-  uint64_t number = 0;
-  const uint64_t pre_max = uint_max / 10;
-  const uint64_t dig_pre_max = uint_max % 10;
-  char d = *p;
-  if (tsep != '\0') {
-    while (1) {
-      if (d == tsep) {
-        d = *++p;
-        continue;
-      } else if (!isdigit_ascii(d)) {
-        break;
-      }
-      if ((number < pre_max) ||
-          ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
-        number = number * 10 + (d - '0');
-        d = *++p;
+  errno = 0;
+  char *endptr = NULL;
+  uint64_t result = strtoull(p_item, &endptr, 10);
 
-      } else {
-        *error = ERROR_OVERFLOW;
-        return 0;
-      }
+  while (errno == 0 && tsep != '\0' && *endptr == tsep) {
+    // Skip multiple consecutive tsep
+    while (*endptr == tsep) {
+      endptr++;
     }
-  } else {
-    while (isdigit_ascii(d)) {
-      if ((number < pre_max) ||
-          ((number == pre_max) && ((uint64_t)(d - '0') <= dig_pre_max))) {
-        number = number * 10 + (d - '0');
-        d = *++p;
 
-      } else {
-        *error = ERROR_OVERFLOW;
-        return 0;
-      }
+    if (!isdigit_ascii(*endptr)) {
+      break; // only bare digits may follow a separator (no sign or space)
+    }
+    char *new_end = NULL;
+    uint64_t next_part = strtoull(endptr, &new_end, 10);
+    // Scale by 10^digits in chunks of <= 9 digits so power_int stays in int.
+    for (ptrdiff_t left = new_end - endptr; left > 0; left -= 9) {
+      uint64_t scale = (uint64_t)power_int(10, left > 9 ? 9 : (int)left);
+      if (checked_uint64_mul(result, scale, &result)) {
+        errno = ERANGE;
+      }
+    }
+    if (checked_uint64_add(result, next_part, &result)) {
+      errno = ERANGE;
     }
-  }
 
-  // Skip trailing spaces.
-  while (isspace_ascii(*p)) {
-    ++p;
+    endptr = new_end;
   }
 
-  // Did we use up all the characters?
-  if (*p) {
+  if (!has_only_spaces(endptr)) {
     *error = ERROR_INVALID_CHARS;
-    return 0;
+    result = 0;
+  } else if (errno == ERANGE || result > uint_max) {
+    *error = ERROR_OVERFLOW;
+    result = 0;
+  } else {
+    *error = 0;
   }
 
-  if (number > (uint64_t)int_max) {
+  if (result > (uint64_t)int_max) {
     state->seen_uint = 1;
   }
 
-  *error = 0;
-  return number;
+  return result;
 }