Skip to content
10 changes: 10 additions & 0 deletions pandas/core/config_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,3 +897,13 @@ def register_converter_cb(key: str) -> None:
"(at which point this option will be deprecated).",
validator=is_one_of_factory([True, False]),
)

# Opt-in flag (default False) controlling whether ``usecols`` keeps the
# caller's column order when building the DataFrame.
# NOTE(review): the reader looks this up as "future.usecols_use_order", so this
# call presumably sits inside a ``cf.config_prefix("future")`` block - confirm.
cf.register_option(
    "usecols_use_order",
    False,
    ": bool\n "
    "Whether usecols parameter will use order of input when "
    # Trailing space keeps "3.0" and "(at which point" from running together
    # once the adjacent string literals are concatenated.
    "making a DataFrame. \n This feature will be default in pandas 3.0 "
    "(at which point this option will be deprecated).",
    # Only the two boolean values are accepted for this option.
    validator=is_one_of_factory([True, False]),
)
17 changes: 16 additions & 1 deletion pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
defaultdict,
)
import csv
from inspect import isfunction
import sys
from textwrap import fill
from typing import (
Expand All @@ -26,6 +27,8 @@

import numpy as np

from pandas._config import get_option

from pandas._libs import lib
from pandas._libs.parsers import STR_NA_VALUES
from pandas.errors import (
Expand Down Expand Up @@ -1516,8 +1519,10 @@ def read(self, nrows: int | None = None) -> DataFrame:

if hasattr(self, "orig_options"):
dtype_arg = self.orig_options.get("dtype", None)
usecols = self.orig_options.get("usecols", None)
else:
dtype_arg = None
usecols = None

if isinstance(dtype_arg, dict):
dtype = defaultdict(lambda: None) # type: ignore[var-annotated]
Expand All @@ -1530,6 +1535,17 @@ def read(self, nrows: int | None = None) -> DataFrame:
else:
dtype = None

if get_option("future.usecols_use_order"):
if usecols is None or isfunction(usecols):
# Doesn't change anything if function or None gets passed
pass
elif len(usecols) == len(columns):
# uses size of number in usecols to determine corresponding columns
value_ranked = {v: i for i, v in enumerate(sorted(usecols))}
usecols_pressed = [value_ranked[v] for v in usecols]
columns = [columns[i] for i in usecols_pressed]
col_dict = {k: col_dict[k] for k in columns}

if dtype is not None:
new_col_dict = {}
for k, v in col_dict.items():
Expand All @@ -1548,7 +1564,6 @@ def read(self, nrows: int | None = None) -> DataFrame:
index=index,
copy=False,
)

self._currow += new_rows
return df

Expand Down
40 changes: 40 additions & 0 deletions pandas/tests/io/parser/usecols/test_usecols_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import numpy as np
import pytest

from pandas._config.config import option_context

from pandas.errors import ParserError

from pandas import (
Expand Down Expand Up @@ -545,3 +547,41 @@ def test_usecols_dtype(all_parsers):
{"col1": array(["a", "b"]), "col2": np.array([1, 2], dtype="uint8")}
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("usecols", [(3, 0, 2), ("d", "a", "c")])
@pytest.mark.parametrize("usecols_use_order", (True, False))
def test_usecols_order(all_parsers, usecols, usecols_use_order):
    """Check the column order of ``read_csv`` output under the
    ``future.usecols_use_order`` option, for positional and named ``usecols``.
    """
    # TODO: add portion in doc for 3.0 transition
    parser = all_parsers
    data = """\
a,b,c,d
1,2,3,0
4,5,6,0
7,8,9,0
10,11,12,13"""

    is_pyarrow = parser.engine == "pyarrow"

    # Integer usecols are expected to be rejected by the pyarrow engine;
    # verify the error and stop, there is no frame to compare.
    if is_pyarrow and isinstance(usecols[0], int):
        msg = "The pyarrow engine does not allow 'usecols' to be integer column"
        with pytest.raises(ValueError, match=msg):
            parser.read_csv(StringIO(data), usecols=usecols)
        return

    # It looks like pyarrow already considers the usecols order by default,
    # so it is expected to match the "ordered" result regardless of the option.
    keep_requested_order = usecols_use_order or is_pyarrow

    column_values = {
        "a": [1, 4, 7, 10],
        "c": [3, 6, 9, 12],
        "d": [0, 0, 0, 13],
    }
    if keep_requested_order:
        column_order = ["d", "a", "c"]
    else:
        column_order = ["a", "c", "d"]
    expected = DataFrame({name: column_values[name] for name in column_order})

    with option_context("future.usecols_use_order", usecols_use_order):
        result = parser.read_csv(StringIO(data), usecols=usecols)
    tm.assert_frame_equal(result, expected)
Loading