Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit ab14b31

Browse files
Add support for standalone use of pandas MultiIndex (#980)
* Initial version of pandas MultiIndex class * Fixing tests * Removing debug traces and reorganizing code * Fixing PEP, updating comments, removing debug scripts * Make separate classes for SdcTypeRef-s types * Move helper functions to new file and a bit of renaming * Using infer_global to type classes instead of typeof(type)
1 parent 8de494b commit ab14b31

20 files changed

+2609
-66
lines changed

sdc/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949

5050
import sdc.extensions.indexes.range_index_ext
5151
import sdc.extensions.indexes.int64_index_ext
52+
import sdc.extensions.indexes.multi_index_ext
5253

5354
import sdc.extensions.sdc_hashmap_ext
5455

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4574,7 +4574,7 @@ def _series_operator_add_str_impl(self, other):
45744574
else:
45754575
indexes_join_res = sdc_indexes_join_outer(left_index, right_index)
45764576

4577-
# FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed
4577+
# FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed
45784578
joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res)
45794579
result_size = len(joined_index)
45804580
result_nan_mask = numpy.zeros(result_size, dtype=numpy.bool_)
@@ -4692,7 +4692,7 @@ def _series_operator_mul_common_impl(self, other):
46924692
else:
46934693
indexes_join_res = sdc_indexes_join_outer(left_index, right_index)
46944694

4695-
# FIXME_Numba#XXXX: remove sdc_fix_indexes_join call at all when issue is fixed
4695+
# FIXME_Numba#6686: remove sdc_fix_indexes_join call at all when issue is fixed
46964696
joined_index, left_indexer, right_indexer = sdc_fix_indexes_join(*indexes_join_res)
46974697
str_series_operand = self if self_is_string_series == True else other # noqa
46984698
str_series_indexer = left_indexer if self_is_string_series == True else right_indexer # noqa

sdc/datatypes/indexes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,4 @@
3030
from .positional_index_type import PositionalIndexType
3131
from .empty_index_type import EmptyIndexType
3232
from .int64_index_type import Int64IndexType
33+
from .multi_index_type import MultiIndexType
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# -*- coding: utf-8 -*-
2+
# *****************************************************************************
3+
# Copyright (c) 2021, Intel Corporation All rights reserved.
4+
#
5+
# Redistribution and use in source and binary forms, with or without
6+
# modification, are permitted provided that the following conditions are met:
7+
#
8+
# Redistributions of source code must retain the above copyright notice,
9+
# this list of conditions and the following disclaimer.
10+
#
11+
# Redistributions in binary form must reproduce the above copyright notice,
12+
# this list of conditions and the following disclaimer in the documentation
13+
# and/or other materials provided with the distribution.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
# *****************************************************************************
27+
28+
from numba import types
29+
from numba.extending import (
30+
models,
31+
register_model,
32+
make_attribute_wrapper,
33+
)
34+
35+
36+
class MultiIndexIteratorType(types.SimpleIteratorType):
    """Numba type of the iterator produced when iterating over a multi-index type.

    The iterator yields values of the iterated type's ``dtype`` and keeps the
    iterated type itself in ``parent``.
    """

    def __init__(self, iterable):
        # the data model of the iterator reads the index type from 'parent'
        self.parent = iterable
        element_type = iterable.dtype
        type_name = "iter[{}->{}],{}".format(iterable, element_type, iterable.name)
        super().__init__(type_name, element_type)
44+
45+
46+
@register_model(MultiIndexIteratorType)
class MultiIndexIterModel(models.StructModel):
    """Struct data model registered for MultiIndexIteratorType."""

    def __init__(self, dmm, fe_type):
        # reference to the index object being iterated
        parent_member = ('parent', fe_type.parent)
        # iterator state (i.e. counter)
        state_member = ('state', types.CPointer(types.int64))
        super().__init__(dmm, fe_type, [parent_member, state_member])
54+
55+
56+
class MultiIndexType(types.IterableType):
    """Numba type describing a multi-index built from per-level types and codes.

    Attributes set by the constructor:
        levels: type of the levels container; handled either as a
            ``types.UniTuple`` or as an iterable of per-level types
            (presumably a ``types.Tuple`` - TODO confirm against callers).
        codes: type of the codes container, handled the same way as ``levels``.
        is_named: flag stored on the type and included into the type name.
    """

    def __init__(self, levels, codes, is_named=False):
        self.levels = levels
        self.codes = codes
        self.is_named = is_named
        super(MultiIndexType, self).__init__(
            name='MultiIndexType({}, {}, {})'.format(levels, codes, is_named))

    @property
    def iterator_type(self):
        """Type of the iterator obtained when iterating over this index type."""
        return MultiIndexIteratorType(self).iterator_type

    @property
    def dtype(self):
        """Tuple type composed of the ``dtype`` of every level type."""
        # reuse levels_types instead of repeating the UniTuple expansion here
        return types.Tuple.from_types([level.dtype for level in self.levels_types])

    @property
    def nlevels(self):
        """Number of levels, i.e. ``len(self.levels)``."""
        return len(self.levels)

    @property
    def levels_types(self):
        """Per-level types: an expanded list for a UniTuple, else ``self.levels`` itself."""
        if isinstance(self.levels, types.UniTuple):
            return [self.levels.dtype] * self.levels.count

        return self.levels

    @property
    def codes_types(self):
        """Per-level codes types: an expanded list for a UniTuple, else ``self.codes`` itself."""
        if isinstance(self.codes, types.UniTuple):
            return [self.codes.dtype] * self.codes.count

        return self.codes
92+
93+
94+
@register_model(MultiIndexType)
class MultiIndexModel(models.StructModel):
    """Struct data model registered for MultiIndexType: levels, codes and name."""

    def __init__(self, dmm, fe_type):
        # the name member is a unicode string for named indexes and none otherwise
        if fe_type.is_named:
            name_type = types.unicode_type
        else:
            name_type = types.none  # TO-DO: change to types.Optional

        members = [
            ('levels', fe_type.levels),
            ('codes', fe_type.codes),
            ('name', name_type),
        ]
        super().__init__(dmm, fe_type, members)
107+
108+
109+
# Expose the 'levels', 'codes' and 'name' members of the MultiIndexType data model
# as the attributes '_levels', '_codes' and '_name' of MultiIndexType values.
make_attribute_wrapper(MultiIndexType, 'levels', '_levels')
110+
make_attribute_wrapper(MultiIndexType, 'codes', '_codes')
111+
make_attribute_wrapper(MultiIndexType, 'name', '_name')

sdc/datatypes/sdc_typeref.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2021, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
import pandas as pd
28+
29+
from numba.core import types
30+
from numba.extending import (models, register_model, )
31+
from numba.core.typing.templates import infer_global
32+
33+
from sdc.extensions.sdc_hashmap_type import ConcurrentDict, ConcurrentDictType
34+
from sdc.datatypes.indexes import MultiIndexType
35+
36+
37+
# FIXME_Numba#6781: due to overlapping of overload_methods for Numba TypeRef
38+
# we have to use our new SdcTypeRef to type objects created from types.Type
39+
# (i.e. ConcurrentDict meta-type). This should be removed once it's fixed.
40+
def sdc_make_new_typeref_class():
    """Define and return a new SdcTypeRef type class.

    Each call creates a separate class object and registers an opaque data
    model for it, so the returned classes are distinct from one another.
    """

    class SdcTypeRef(types.Dummy):
        """Reference to a type.

        Used when a type is passed as a value.
        """

        def __init__(self, instance_type):
            self.instance_type = instance_type
            super().__init__('sdc_typeref[{}]'.format(instance_type))

    @register_model(SdcTypeRef)
    class SdcTypeRefModel(models.OpaqueModel):
        def __init__(self, dmm, fe_type):
            super().__init__(dmm, fe_type)

    return SdcTypeRef
57+
58+
59+
# Separate typeref classes, one for each meta-type that has to be typed as a value
ConcurrentDictTypeRef = sdc_make_new_typeref_class()
60+
MultiIndexTypeRef = sdc_make_new_typeref_class()
61+
62+
# Type the globals ConcurrentDict and pd.MultiIndex as instances of these typeref classes
infer_global(ConcurrentDict, ConcurrentDictTypeRef(ConcurrentDictType))
63+
infer_global(pd.MultiIndex, MultiIndexTypeRef(MultiIndexType))

sdc/extensions/indexes/indexes_generic.py

Lines changed: 113 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,12 @@
3030
import pandas as pd
3131

3232
from numba import types
33-
from numba.typed import Dict
33+
from numba.typed import Dict, List
3434
from numba.typed.typedobjectutils import _nonoptional
3535

3636
from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types, sdc_old_index_types
3737
from sdc.datatypes.indexes import *
38-
from sdc.utilities.utils import sdc_overload_method, sdc_overload
38+
from sdc.utilities.utils import sdc_overload
3939
from sdc.utilities.sdc_typing_utils import (
4040
find_index_common_dtype,
4141
sdc_indexes_wo_values_cache,
@@ -96,7 +96,9 @@ def sdc_indexes_operator_eq_ovld(self, other):
9696
# TO-DO: this is for numeric indexes only now, extend to string-index when it's added
9797
use_self_values = isinstance(self, sdc_pandas_index_types) and not isinstance(self, types.Array)
9898
use_other_values = isinstance(other, sdc_pandas_index_types) and not isinstance(other, types.Array)
99-
one_operand_is_scalar = isinstance(self, types.Number) or isinstance(other, types.Number)
99+
100+
one_operand_is_scalar = (isinstance(other, sdc_pandas_index_types) and self is other.dtype
101+
or isinstance(self, sdc_pandas_index_types) and other is self.dtype)
100102

101103
def sdc_indexes_operator_eq_impl(self, other):
102104

@@ -217,8 +219,8 @@ def pd_fix_indexes_join_overload(joined, indexer1, indexer2):
217219
""" Wraps pandas index.join() into new function that returns indexers as arrays and not optional(array) """
218220

219221
# This function is simply a workaround for problem with parfor lowering
220-
# broken by indexers typed as types.Optional(Array) - FIXME_Numba#XXXX: remove it
221-
# in all places whne parfor issue is fixed
222+
# broken by indexers typed as types.Optional(Array) - FIXME_Numba#6686: remove it
223+
# in all places when parfor issue is fixed
222224
def pd_fix_indexes_join_impl(joined, indexer1, indexer2):
223225
if indexer1 is not None:
224226
_indexer1 = _nonoptional(indexer1)
@@ -282,3 +284,109 @@ def sdc_np_array_overload(A):
282284

283285
if isinstance(A, Int64IndexType):
284286
return lambda A: A._data
287+
288+
289+
def sdc_indexes_take(self, target):
    """Stub without a Python implementation; it is the target of an sdc_overload."""
291+
292+
293+
@sdc_overload(sdc_indexes_take)
def pd_fix_indexes_take_overload(self, indexes):
    """ Workaround for the lack of a take method common to all index types,
        caused by the fact that StringArrayType is one of the index types """

    # no implementation is provided for types that are not index types
    if not isinstance(self, sdc_pandas_index_types):
        return None

    # types from sdc_old_index_types go through numpy_like.take
    # instead of their own take method
    index_api_supported = not isinstance(self, sdc_old_index_types)

    def pd_fix_indexes_take_impl(self, indexes):

        # NOTE(review): the explicit '== True' is kept on purpose (hence noqa),
        # presumably to let the compiler prune the unused branch - confirm
        if index_api_supported == True:  # noqa
            res = self.take(indexes)
        else:
            res = numpy_like.take(self, indexes)

        return res

    return pd_fix_indexes_take_impl
314+
315+
316+
def sdc_indexes_rename(index, name):
    """Stub without a Python implementation; it is the target of an sdc_overload."""
318+
319+
320+
@sdc_overload(sdc_indexes_rename)
def sdc_index_rename_ovld(index, name):
    """Select the implementation of sdc_indexes_rename by the type of index.

    Types from sdc_old_index_types are returned unchanged; positional, range
    and int64 indexes are re-created with the new name. No implementation is
    returned for non-index types or for index types not handled below.
    """

    if not isinstance(index, sdc_pandas_index_types):
        return None

    if isinstance(index, sdc_old_index_types):
        def sdc_indexes_rename_stub(index, name):
            # cannot rename string or float indexes, TO-DO: StringIndexType
            return index
        return sdc_indexes_rename_stub

    if isinstance(index, PositionalIndexType):
        # imported here, not at the top of the module
        from sdc.extensions.indexes.positional_index_ext import init_positional_index

        def sdc_indexes_rename_impl(index, name):
            return init_positional_index(len(index), name)
        return sdc_indexes_rename_impl

    if isinstance(index, RangeIndexType):
        def sdc_indexes_rename_impl(index, name):
            return pd.RangeIndex(index.start, index.stop, index.step, name=name)
        return sdc_indexes_rename_impl

    if isinstance(index, Int64IndexType):
        def sdc_indexes_rename_impl(index, name):
            return pd.Int64Index(index, name=name)
        return sdc_indexes_rename_impl

    return None
348+
349+
350+
def sdc_indexes_get_name(index):
    """Stub without a Python implementation; it is the target of an sdc_overload."""
352+
353+
354+
@sdc_overload(sdc_indexes_get_name)
def sdc_indexes_get_name_ovld(index):
    """Select the implementation of sdc_indexes_get_name by the type of index.

    Index types other than those in sdc_old_index_types return their name
    attribute, every other argument type yields None.
    """

    has_name_attribute = (isinstance(index, sdc_pandas_index_types)
                          and not isinstance(index, sdc_old_index_types))

    if not has_name_attribute:
        def sdc_indexes_get_name_stub(index):
            # no name is available for string or float indexes, TO-DO: StringIndexType
            return None
        return sdc_indexes_get_name_stub

    def sdc_indexes_get_name_impl(index):
        return index.name
    return sdc_indexes_get_name_impl
367+
368+
369+
def sdc_indexes_build_map_positions(self):
    """Stub without a Python implementation; it is the target of an sdc_overload."""
371+
372+
373+
@sdc_overload(sdc_indexes_build_map_positions)
def sdc_indexes_build_map_positions_ovld(self):
    """Build a map from every value of self to the list of its positions.

    The returned implementation creates a typed Dict keyed by self.dtype whose
    values are typed lists of int64 positions, filled in increasing order.
    """

    indexer_dtype = self.dtype
    indexer_value_type = types.ListType(types.int64)

    def sdc_indexes_build_map_positions_impl(self):
        indexer_map = Dict.empty(indexer_dtype, indexer_value_type)
        for i in range(len(self)):
            val = self[i]
            index_list = indexer_map.get(val, None)
            if index_list is None:
                # first occurrence of val: fill the new list before storing it,
                # so the dict is not looked up again just to append
                new_list = List.empty_list(types.int64)
                new_list.append(i)
                indexer_map[val] = new_list
            else:
                index_list.append(i)

        return indexer_map

    return sdc_indexes_build_map_positions_impl

0 commit comments

Comments
 (0)