Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit d780d38

Browse files
authored
Re-implement DataFrame unboxing (#860)
* Boxing draft

  Merge branch 'master' of https://github.com/IntelPython/sdc into merge_master

  # Conflicts:
  #   sdc/hiframes/pd_dataframe_ext.py
  #   sdc/tests/test_dataframe.py

* Implement unboxing in new structure
* Improve variable names + add error handling
* Return error status
* Move getting list size to if_ok block
* Unskipped unexpected success tests
* Unskipped unexpected success tests in GroupBy
* Remove decorators
* Change to incref False
* Skip tests failed due to unimplemented df structure
* Bug in rolling
1 parent b5221f0 commit d780d38

File tree

5 files changed

+84
-143
lines changed

5 files changed

+84
-143
lines changed

sdc/hiframes/boxing.py

Lines changed: 54 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,7 @@
3333
from numba.extending import (typeof_impl, unbox, register_model, models,
3434
NativeValue, box, intrinsic)
3535
from numba import types
36-
from numba.core import cgutils
36+
from numba.core import cgutils, typing
3737
from numba.np import numpy_support
3838
from numba.core.typing import signature
3939
from numba.core.boxing import box_array, unbox_array, box_list
@@ -48,6 +48,8 @@
4848
from sdc.hiframes.pd_series_ext import SeriesType
4949
from sdc.hiframes.pd_series_type import _get_series_array_type
5050

51+
from sdc.hiframes.pd_dataframe_ext import get_structure_maps
52+
5153
from .. import hstr_ext
5254
import llvmlite.binding as ll
5355
from llvmlite import ir as lir
@@ -58,12 +60,14 @@
5860

5961
@typeof_impl.register(pd.DataFrame)
6062
def typeof_pd_dataframe(val, c):
63+
6164
col_names = tuple(val.columns.tolist())
6265
# TODO: support other types like string and timestamp
6366
col_types = get_hiframes_dtypes(val)
6467
index_type = _infer_index_type(val.index)
68+
column_loc, _, _ = get_structure_maps(col_types, col_names)
6569

66-
return DataFrameType(col_types, index_type, col_names, True)
70+
return DataFrameType(col_types, index_type, col_names, True, column_loc=column_loc)
6771

6872

6973
# register series types for import
@@ -86,21 +90,55 @@ def unbox_dataframe(typ, val, c):
8690
# create dataframe struct and store values
8791
dataframe = cgutils.create_struct_proxy(typ)(c.context, c.builder)
8892

89-
column_tup = c.context.make_tuple(
90-
c.builder, types.UniTuple(string_type, n_cols), column_strs)
93+
errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit)
9194

92-
# this unboxes all DF columns so that no column unboxing occurs later
93-
for col_ind in range(n_cols):
94-
series_obj = c.pyapi.object_getattr_string(val, typ.columns[col_ind])
95-
arr_obj = c.pyapi.object_getattr_string(series_obj, "values")
96-
ty_series = typ.data[col_ind]
97-
if isinstance(ty_series, types.Array):
98-
native_val = unbox_array(typ.data[col_ind], arr_obj, c)
99-
elif ty_series == string_array_type:
100-
native_val = unbox_str_series(string_array_type, series_obj, c)
95+
col_list_type = types.List(string_type)
96+
ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder, col_list_type, n_cols)
10197

102-
dataframe.data = c.builder.insert_value(
103-
dataframe.data, native_val.value, col_ind)
98+
with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok):
99+
with if_ok:
100+
inst.size = c.context.get_constant(types.intp, n_cols)
101+
for i, column_str in enumerate(column_strs):
102+
inst.setitem(c.context.get_constant(types.intp, i), column_str, incref=False)
103+
dataframe.columns = inst.value
104+
105+
with if_not_ok:
106+
c.builder.store(cgutils.true_bit, errorptr)
107+
108+
# If an error occurred, drop the whole native list
109+
with c.builder.if_then(c.builder.load(errorptr)):
110+
c.context.nrt.decref(c.builder, col_list_type, inst.value)
111+
112+
_, data_typs_map, types_order = get_structure_maps(typ.data, typ.columns)
113+
114+
for col_typ in types_order:
115+
type_id, col_indices = data_typs_map[col_typ]
116+
n_type_cols = len(col_indices)
117+
list_type = types.List(col_typ)
118+
ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder, list_type, n_type_cols)
119+
120+
with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok):
121+
with if_ok:
122+
inst.size = c.context.get_constant(types.intp, n_type_cols)
123+
for i, col_idx in enumerate(col_indices):
124+
series_obj = c.pyapi.object_getattr_string(val, typ.columns[col_idx])
125+
arr_obj = c.pyapi.object_getattr_string(series_obj, "values")
126+
ty_series = typ.data[col_idx]
127+
if isinstance(ty_series, types.Array):
128+
native_val = unbox_array(typ.data[col_idx], arr_obj, c)
129+
elif ty_series == string_array_type:
130+
native_val = unbox_str_series(string_array_type, series_obj, c)
131+
132+
inst.setitem(c.context.get_constant(types.intp, i), native_val.value, incref=False)
133+
134+
dataframe.data = c.builder.insert_value(dataframe.data, inst.value, type_id)
135+
136+
with if_not_ok:
137+
c.builder.store(cgutils.true_bit, errorptr)
138+
139+
# If an error occurred, drop the whole native list
140+
with c.builder.if_then(c.builder.load(errorptr)):
141+
c.context.nrt.decref(c.builder, list_type, inst.value)
104142

105143
# TODO: support unboxing index
106144
if typ.index == types.none:
@@ -113,7 +151,6 @@ def unbox_dataframe(typ, val, c):
113151
index_data = c.pyapi.object_getattr_string(index_obj, "_data")
114152
dataframe.index = unbox_array(typ.index, index_data, c).value
115153

116-
dataframe.columns = column_tup
117154
dataframe.parent = val
118155

119156
# increase refcount of stored values
@@ -122,7 +159,7 @@ def unbox_dataframe(typ, val, c):
122159
for var in column_strs:
123160
c.context.nrt.incref(c.builder, string_type, var)
124161

125-
return NativeValue(dataframe._getvalue())
162+
return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr))
126163

127164

128165
def get_hiframes_dtypes(df):

sdc/hiframes/pd_dataframe_ext.py

Lines changed: 23 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -59,33 +59,20 @@ class ColumnLoc(NamedTuple):
5959
col_id: int
6060

6161

62-
@intrinsic
63-
def init_dataframe(typingctx, *args):
64-
"""Create a DataFrame with provided data, index and columns values.
65-
Used as a single constructor for DataFrame and assigning its data, so that
66-
optimization passes can look for init_dataframe() to see if underlying
67-
data has changed, and get the array variables from init_dataframe() args if
68-
not changed.
69-
"""
70-
71-
n_cols = len(args) // 2
72-
data_typs = tuple(args[:n_cols])
73-
index_typ = args[n_cols]
74-
column_names = tuple(a.literal_value for a in args[n_cols + 1:])
75-
62+
def get_structure_maps(col_types, col_names):
7663
# Define map column name to column location ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
7764
column_loc = {}
7865
# Store unique types of columns ex. {'int64': (0, [0, 2]), 'float64': (1, [1])}
7966
data_typs_map = {}
8067
types_order = []
8168
type_id = 0
82-
for i, col_typ in enumerate(data_typs):
83-
col_name = column_names[i]
69+
for i, col_typ in enumerate(col_types):
70+
col_name = col_names[i]
8471

8572
if col_typ not in data_typs_map:
8673
data_typs_map[col_typ] = (type_id, [i])
8774
# The first column in each type always has 0 index
88-
column_loc[col_name] = ColumnLoc(type_id, col_id=0)
75+
column_loc[col_name] = ColumnLoc(type_id, 0)
8976
types_order.append(col_typ)
9077
type_id += 1
9178
else:
@@ -95,6 +82,25 @@ def init_dataframe(typingctx, *args):
9582
column_loc[col_name] = ColumnLoc(existing_type_id, col_id)
9683
col_indices.append(i)
9784

85+
return column_loc, data_typs_map, types_order
86+
87+
88+
@intrinsic
89+
def init_dataframe(typingctx, *args):
90+
"""Create a DataFrame with provided data, index and columns values.
91+
Used as a single constructor for DataFrame and assigning its data, so that
92+
optimization passes can look for init_dataframe() to see if underlying
93+
data has changed, and get the array variables from init_dataframe() args if
94+
not changed.
95+
"""
96+
97+
n_cols = len(args) // 2
98+
data_typs = tuple(args[:n_cols])
99+
index_typ = args[n_cols]
100+
column_names = tuple(a.literal_value for a in args[n_cols + 1:])
101+
102+
column_loc, data_typs_map, types_order = get_structure_maps(data_typs, column_names)
103+
98104
def codegen(context, builder, signature, args):
99105
in_tup = args[0]
100106
data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)]

0 commit comments

Comments
 (0)