@@ -3457,13 +3457,7 @@ def _indexer_from_factorized(labels, shape, compress=True):
34573457 comp_ids = group_index
34583458 max_group = com ._long_prod (shape )
34593459
3460- if max_group > 1e6 :
3461- # Use mergesort to avoid memory errors in counting sort
3462- indexer = comp_ids .argsort (kind = 'mergesort' )
3463- else :
3464- indexer , _ = _algos .groupsort_indexer (comp_ids .astype (np .int64 ),
3465- max_group )
3466-
3460+ indexer = _get_group_index_sorter (comp_ids .astype (np .int64 ), max_group )
34673461 return indexer
34683462
34693463
@@ -3586,21 +3580,27 @@ def _get_indices_dict(label_list, keys):
35863580
def _get_group_index_sorter(group_index, ngroups):
    """
    Return an indexer that stably sorts `group_index`.

    One of two stable sorting algorithms is picked, whichever is expected to
    be cheaper for the given input:

    * _algos.groupsort_indexer implements `counting sort` and it is at least
      O(ngroups), where
          ngroups = prod(shape)
          shape = map(len, keys)
      that is, linear in the number of combinations (cartesian product) of
      unique values of groupby keys. This can be huge when doing multi-key
      groupby.
    * np.argsort(kind='mergesort') is O(count x log(count)) where count is
      the length of the data-frame.

    Both algorithms are `stable` sorts and that is necessary for correctness
    of groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')

    Parameters
    ----------
    group_index : array-like
        Group label of each row. Must support ``len()`` and
        ``.argsort(kind=...)`` (presumably a numpy integer array -- confirm
        against callers).
    ngroups : int
        Number of possible groups, i.e. ``prod(shape)`` as described above.

    Returns
    -------
    Indexer that sorts `group_index`: the platform-int result of
    ``_algos.groupsort_indexer`` when counting sort is chosen, otherwise the
    result of ``group_index.argsort(kind='mergesort')``.
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0   # some room for fine-tuning these parameters

    # ``count > 0`` keeps np.log(0) from being evaluated for empty input,
    # which would emit RuntimeWarnings and compare against nan.  Empty input
    # always falls through to the mergesort branch, as it did before.
    use_counting_sort = (count > 0 and
                         alpha + beta * ngroups < count * np.log(count))

    if use_counting_sort:
        sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index),
                                             ngroups)
        return com._ensure_platform_int(sorter)

    # mergesort (unlike the default quicksort) is stable
    return group_index.argsort(kind='mergesort')
36043604
36053605
36063606def _compress_group_index (group_index , sort = True ):
0 commit comments