@@ -3431,13 +3431,7 @@ def _indexer_from_factorized(labels, shape, compress=True):
34313431 comp_ids = group_index
34323432 max_group = com ._long_prod (shape )
34333433
3434- if max_group > 1e6 :
3435- # Use mergesort to avoid memory errors in counting sort
3436- indexer = comp_ids .argsort (kind = 'mergesort' )
3437- else :
3438- indexer , _ = _algos .groupsort_indexer (comp_ids .astype (np .int64 ),
3439- max_group )
3440-
3434+ indexer = _get_group_index_sorter (comp_ids .astype (np .int64 ), max_group )
34413435 return indexer
34423436
34433437
@@ -3560,21 +3554,27 @@ def _get_indices_dict(label_list, keys):
35603554
def _get_group_index_sorter(group_index, ngroups):
    """
    Return the indexer that stably sorts ``group_index``.

    Two algorithms are available and the cheaper one is picked from their
    nominal complexities:

    _algos.groupsort_indexer implements `counting sort` and it is at least
    O(ngroups), where
        ngroups = prod(shape)
        shape = map(len, keys)
    that is, linear in the number of combinations (cartesian product) of
    unique values of groupby keys. This can be huge when doing multi-key
    groupby.

    np.argsort(kind='mergesort') is O(count x log(count)) where count is the
    length of the data-frame.

    Both algorithms are `stable` sorts and that is necessary for correctness
    of groupby operations. e.g. consider:
        df.groupby(key)[col].transform('first')

    Parameters
    ----------
    group_index : ndarray
        Group label of each row.
    ngroups : int
        Number of possible groups (the size of the counting-sort table).

    Returns
    -------
    ndarray
        Positions that sort ``group_index`` while keeping equal labels in
        their original relative order.
    """
    count = len(group_index)
    alpha = 0.0  # taking complexities literally; there may be
    beta = 1.0  # some room for fine-tuning these parameters

    # Guard the empty case explicitly: np.log(0) is -inf (with a
    # RuntimeWarning) and 0 * -inf is nan, so without the guard the branch
    # below would only be selected by accident of nan comparisons.
    use_counting_sort = (count > 0 and
                         alpha + beta * ngroups < count * np.log(count))

    if use_counting_sort:
        sorter, _ = _algos.groupsort_indexer(com._ensure_int64(group_index),
                                             ngroups)
        return com._ensure_platform_int(sorter)

    return group_index.argsort(kind='mergesort')
35783578
35793579
35803580def _compress_group_index (group_index , sort = True ):
0 commit comments