1515
1616import numpy as np
1717
18+ from pandas ._config import using_pyarrow_strict_nans
19+
1820from pandas ._libs import lib
19- from pandas ._libs .missing import NA
21+ from pandas ._libs .missing import is_pdna_or_none
2022from pandas ._libs .tslibs import (
2123 Timedelta ,
2224 Timestamp ,
@@ -324,6 +326,11 @@ def _from_sequence_of_strings(
324326 """
325327 Construct a new ExtensionArray from a sequence of strings.
326328 """
329+ mask = isna (strings )
330+
331+ if isinstance (strings , cls ):
332+ strings = strings ._pa_array
333+
327334 pa_type = to_pyarrow_type (dtype )
328335 if (
329336 pa_type is None
@@ -342,22 +349,35 @@ def _from_sequence_of_strings(
342349 from pandas .core .tools .datetimes import to_datetime
343350
344351 scalars = to_datetime (strings , errors = "raise" ).date
352+
353+ if isinstance (strings , cls ):
354+ # Avoid an object path
355+ # TODO: this assumes that pyarrow's str->date casting is the
356+ # same as to_datetime. Is that a fair assumption?
357+ scalars = strings ._pa_array .cast (pa_type )
358+ else :
359+ scalars = pa .array (scalars , mask = mask .view (bool ), type = pa_type )
360+
345361 elif pa .types .is_duration (pa_type ):
346362 from pandas .core .tools .timedeltas import to_timedelta
347363
348364 scalars = to_timedelta (strings , errors = "raise" )
365+
349366 if pa_type .unit != "ns" :
350367 # GH51175: test_from_sequence_of_strings_pa_array
351368 # attempt to parse as int64 reflecting pyarrow's
352369 # duration to string casting behavior
353370 mask = isna (scalars )
354- if not isinstance (strings , (pa .Array , pa .ChunkedArray )):
355- strings = pa .array (strings , type = pa .string ())
371+ if isinstance (strings , cls ):
372+ strings = strings ._pa_array
373+ elif not isinstance (strings , (pa .Array , pa .ChunkedArray )):
374+ strings = pa .array (strings , type = pa .string (), mask = mask )
356375 strings = pc .if_else (mask , None , strings )
357376 try :
358377 scalars = strings .cast (pa .int64 ())
359378 except pa .ArrowInvalid :
360379 pass
380+
361381 elif pa .types .is_time (pa_type ):
362382 from pandas .core .tools .times import to_time
363383
@@ -373,7 +393,7 @@ def _from_sequence_of_strings(
373393 if isinstance (strings , (pa .Array , pa .ChunkedArray )):
374394 scalars = strings
375395 else :
376- scalars = pa .array (strings , type = pa .string ())
396+ scalars = pa .array (strings , type = pa .string (), mask = mask )
377397 scalars = pc .if_else (pc .equal (scalars , "1.0" ), "1" , scalars )
378398 scalars = pc .if_else (pc .equal (scalars , "0.0" ), "0" , scalars )
379399 scalars = scalars .cast (pa .bool_ ())
@@ -385,12 +405,16 @@ def _from_sequence_of_strings(
385405 from pandas .core .tools .numeric import to_numeric
386406
387407 scalars = to_numeric (strings , errors = "raise" )
388- if not pa .types .is_decimal (pa_type ):
408+ if not pa .types .is_decimal (pa_type ) and isinstance (
409+ strings , (pa .Array , pa .ChunkedArray )
410+ ):
389411 # TODO: figure out why doing this cast breaks with decimal dtype
390412 # in test_from_sequence_of_strings_pa_array
391413 mask = strings .is_null ()
392414 scalars = pa .array (scalars , mask = np .array (mask ), type = pa_type )
393415 # TODO: could we just do strings.cast(pa_type)?
416+ elif mask is not None :
417+ scalars = pa .array (scalars , mask = mask .view (bool ), type = pa_type )
394418
395419 else :
396420 raise NotImplementedError (
@@ -544,23 +568,20 @@ def _box_pa_array(
544568 return pa_array
545569
546570 mask = None
547- if getattr (value , "dtype" , None ) is None or value .dtype .kind not in "mfM" :
548- # similar to isna(value) but exclude NaN
549- # TODO: cythonize!
550- mask = np .array ([x is NA or x is None for x in value ], dtype = bool )
551-
552- from_pandas = False
553- if pa .types .is_integer (pa_type ):
554- # If user specifically asks to cast a numpy float array with NaNs
555- # to pyarrow integer, we'll treat those NaNs as NA
556- from_pandas = True
571+ if getattr (value , "dtype" , None ) is None or value .dtype .kind not in "mMf" :
572+ try :
573+ arr_value = np .asarray (value )
574+ except ValueError :
575+ # e.g. list dtype with mixed-length lists
576+ arr_value = np .asarray (value , dtype = object )
577+ # similar to isna(value) but exclude NaN, NaT, nat-like, nan-like
578+ mask = is_pdna_or_none (arr_value )
579+
557580 try :
558- pa_array = pa .array (
559- value , type = pa_type , mask = mask , from_pandas = from_pandas
560- )
581+ pa_array = pa .array (value , type = pa_type , mask = mask )
561582 except (pa .ArrowInvalid , pa .ArrowTypeError ):
562583 # GH50430: let pyarrow infer type, then cast
563- pa_array = pa .array (value , mask = mask , from_pandas = from_pandas )
584+ pa_array = pa .array (value , mask = mask )
564585
565586 if pa_type is None and pa .types .is_duration (pa_array .type ):
566587 # Workaround https://github.com/apache/arrow/issues/37291
@@ -1496,7 +1517,11 @@ def to_numpy(
14961517 pa .types .is_floating (pa_type )
14971518 and (
14981519 na_value is np .nan
1499- or (original_na_value is lib .no_default and is_float_dtype (dtype ))
1520+ or (
1521+ original_na_value is lib .no_default
1522+ and is_float_dtype (dtype )
1523+ and not using_pyarrow_strict_nans ()
1524+ )
15001525 )
15011526 ):
15021527 result = data ._pa_array .to_numpy ()
@@ -1964,8 +1989,10 @@ def _explode(self):
19641989 fill_value = pa .scalar ([None ], type = self ._pa_array .type )
19651990 mask = counts == 0
19661991 if mask .any ():
1967- values = values .copy ()
1968- values [mask ] = fill_value
1992+ # pc.if_else here is similar to `values[mask] = fill_value`
1993+ # but this avoids an object-dtype round-trip.
1994+ pa_values = pc .if_else (~ mask , values ._pa_array , fill_value )
1995+ values = type (self )(pa_values )
19691996 counts = counts .copy ()
19701997 counts [mask ] = 1
19711998 values = values .fillna (fill_value )
@@ -2367,6 +2394,7 @@ def _replace_with_mask(
23672394 replacements = np .array (replacements , dtype = object )
23682395 elif isinstance (replacements , pa .Scalar ):
23692396 replacements = replacements .as_py ()
2397+
23702398 result = np .array (values , dtype = object )
23712399 result [mask ] = replacements
23722400 return pa .array (result , type = values .type )
0 commit comments