Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 21 additions & 20 deletions sdc/hiframes/boxing.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,15 +202,10 @@ def box_dataframe(typ, val, c):
context = c.context
builder = c.builder

n_cols = len(typ.columns)
col_names = typ.columns
arr_typs = typ.data
dtypes = [a.dtype for a in arr_typs] # TODO: check Categorical

dataframe = cgutils.create_struct_proxy(typ)(context, builder, value=val)
col_arrs = [builder.extract_value(dataframe.data, i) for i in range(n_cols)]
# df unboxed from Python
has_parent = cgutils.is_not_null(builder, dataframe.parent)

pyapi = c.pyapi
# gil_state = pyapi.gil_ensure() # acquire GIL
Expand All @@ -219,28 +214,31 @@ def box_dataframe(typ, val, c):
class_obj = pyapi.import_module_noblock(mod_name)
df_dict = pyapi.dict_new()

for i, cname, arr, arr_typ, dtype in zip(range(n_cols), col_names, col_arrs, arr_typs, dtypes):
arrays_list_objs = {}
for cname, arr_typ in zip(col_names, arr_typs):
# df['cname'] = boxed_arr
# TODO: datetime.date, DatetimeIndex?
name_str = context.insert_const_string(c.builder.module, cname)
cname_obj = pyapi.string_from_string(name_str)

if dtype == string_type:
arr_obj = box_str_arr(arr_typ, arr, c)
elif isinstance(dtype, PDCategoricalDtype):
arr_obj = box_categorical_array(arr_typ, arr, c)
# context.nrt.incref(builder, arr_typ, arr)
elif dtype == types.List(string_type):
arr_obj = box_list(list_string_array_type, arr, c)
# context.nrt.incref(builder, arr_typ, arr) # TODO required?
# pyapi.print_object(arr_obj)
else:
arr_obj = box_array(arr_typ, arr, c)
# TODO: is incref required?
# context.nrt.incref(builder, arr_typ, arr)
col_loc = typ.column_loc[cname]
type_id, col_id = col_loc.type_id, col_loc.col_id

# dataframe.data is a tuple of lists of arrays: one list per array type (indexed by type_id)
# e.g. ([array(int64, 1d, C), array(int64, 1d, C)], [array(float64, 1d, C)])
arrays_list_obj = arrays_list_objs.get(type_id)
if arrays_list_obj is None:
list_typ = types.List(arr_typ)
# extracting list from the tuple
list_val = builder.extract_value(dataframe.data, type_id)
# box the whole list once per type_id; individual arrays are then taken from the boxed list
arrays_list_obj = box_list(list_typ, list_val, c)
arrays_list_objs[type_id] = arrays_list_obj

# PyList_GetItem returns borrowed reference
arr_obj = pyapi.list_getitem(arrays_list_obj, col_id)
pyapi.dict_setitem(df_dict, cname_obj, arr_obj)

pyapi.decref(arr_obj)
pyapi.decref(cname_obj)

df_obj = pyapi.call_method(class_obj, "DataFrame", (df_dict,))
Expand All @@ -252,6 +250,9 @@ def box_dataframe(typ, val, c):
pyapi.object_setattr_string(df_obj, 'index', arr_obj)
pyapi.decref(arr_obj)

for arrays_list_obj in arrays_list_objs.values():
pyapi.decref(arrays_list_obj)

pyapi.decref(class_obj)
# pyapi.gil_release(gil_state) # release GIL
return df_obj
Expand Down
88 changes: 43 additions & 45 deletions sdc/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ class TestDataFrame(TestCase):

# TODO: Data generator for DataFrames

@dfRefactoringNotImplemented
def test_create1(self):
def test_impl(A, B):
df = pd.DataFrame({'A': A, 'B': B})
Expand Down Expand Up @@ -108,7 +107,6 @@ def test_impl():

self.assertEqual(hpat_func(), test_impl())

@dfRefactoringNotImplemented
def test_create_with_series1(self):
def test_impl(n):
A = pd.Series(np.ones(n, dtype=np.int64))
Expand All @@ -132,7 +130,6 @@ def test_impl(A):
self.assertEqual(hpat_func(df.A), test_impl(df.A))

@skip_sdc_jit
@dfRefactoringNotImplemented
def test_create_string_index(self):
def test_impl(a):
data = {'A': ['a', 'b'], 'B': [2, 3]}
Expand All @@ -142,7 +139,6 @@ def test_impl(a):
hpat_func = sdc.jit(test_impl)
pd.testing.assert_frame_equal(hpat_func(True), test_impl(True))

@dfRefactoringNotImplemented
def test_create_cond1(self):
def test_impl(A, B, c):
if c:
Expand Down Expand Up @@ -232,7 +228,6 @@ def test_impl(n):
do_check = False if platform.system() == 'Windows' and not IS_32BITS else True
pd.testing.assert_frame_equal(hpat_func(n), test_impl(n), check_dtype=do_check)

@dfRefactoringNotImplemented
def test_box2(self):
def test_impl():
df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'bb', 'ccc']})
Expand Down Expand Up @@ -978,7 +973,6 @@ def test_impl(df):
with self.subTest(index=idx):
pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_isna_no_unboxing(self):
def test_impl():
df = pd.DataFrame({
Expand Down Expand Up @@ -1164,7 +1158,6 @@ def test_impl(df, n, k):
with self.subTest(index=idx, n=n, k=k):
pd.testing.assert_frame_equal(sdc_func(df, n, k), test_impl(df, n, k))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_iloc_slice_no_unboxing(self):
def test_impl(n, k):
df = pd.DataFrame({
Expand Down Expand Up @@ -1280,7 +1273,6 @@ def test_impl(df, n):
with self.subTest(index=idx, n=n):
pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_iloc_list_no_unboxing(self):
def test_impl(n):
df = pd.DataFrame({
Expand Down Expand Up @@ -1310,7 +1302,6 @@ def test_impl(df, n):
with self.subTest(index=idx, n=n):
pd.testing.assert_frame_equal(sdc_func(df, n), test_impl(df, n))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_iloc_list_bool_no_unboxing(self):
def test_impl(n):
df = pd.DataFrame({
Expand Down Expand Up @@ -1429,7 +1420,6 @@ def test_impl(df):
"C": [3.1, 8.4, 7.1, 3.2, 1]}, index=idx)
pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_loc_no_unboxing(self):
def test_impl():
df = pd.DataFrame({
Expand Down Expand Up @@ -1489,7 +1479,6 @@ def impl(a):
)
pd.testing.assert_frame_equal(sdc_func(df), ref_impl(df))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_head_no_unboxing(self):
def test_impl(n):
df = pd.DataFrame({
Expand Down Expand Up @@ -1522,7 +1511,6 @@ def test_impl(df, deep):
with self.subTest(index=idx, deep=deep):
pd.testing.assert_frame_equal(sdc_func(df, deep), test_impl(df, deep))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_copy_no_unboxing(self):
def test_impl(idx, deep):
df = pd.DataFrame({
Expand All @@ -1534,15 +1522,28 @@ def test_impl(idx, deep):
return df.copy(deep=deep)

sdc_impl = sdc.jit(test_impl)
indexes = [[3, 4, 2, 6, 1], ['a', 'b', 'c', 'd', 'e'], None]
indexes = [[3, 4, 2, 6, 1], ['a', 'b', 'c', 'd', 'e']]
cases_deep = [None, True, False]
for idx, deep in product(indexes, cases_deep):
with self.subTest(index=idx, deep=deep):
jit_result = sdc_impl(idx, deep)
ref_result = test_impl(idx, deep)
pd.testing.assert_frame_equal(jit_result, ref_result)

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
@unittest.expectedFailure
def test_df_copy_no_unboxing_none_index_error(self):
    """Compare jitted and pure-pandas ``df.copy(deep=True)`` for ``index=None``.

    The DataFrame is built inside the tested function, so nothing is passed
    in from Python. The test is decorated with ``expectedFailure``, i.e. the
    comparison below is currently expected not to succeed.
    """
    def test_impl():
        df = pd.DataFrame({
            'A': [3.2, np.nan, 7.0, 3.3, np.nan],
            'B': [3, 4, 1, 0, 222],
            'C': [True, True, False, False, True],
            'D': ['a', 'dd', 'c', '12', None]
        }, index=None)
        return df.copy(deep=True)

    sdc_impl = sdc.jit(test_impl)

    # Run the jitted version first, then the reference, as the original did.
    jit_result = sdc_impl()
    ref_result = test_impl()
    pd.testing.assert_frame_equal(jit_result, ref_result)

def test_pct_change1(self):
def test_impl(n):
df = pd.DataFrame({'A': np.arange(n) + 1.0, 'B': np.arange(n) + 1})
Expand Down Expand Up @@ -1714,7 +1715,6 @@ def test_impl(df):
hpat_func = self.jit(test_impl)
pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_reset_index_drop_literal_index_int_no_unboxing(self):
def gen_test_impl(drop):
def test_impl():
Expand Down Expand Up @@ -1745,7 +1745,6 @@ def test_impl(df):

pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_reset_index_drop_default_index_int_no_unboxing(self):
def test_impl():
df = pd.DataFrame({
Expand Down Expand Up @@ -1842,20 +1841,17 @@ def test_impl(df):
index=index)
pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_drop_one_column(self):
def test_impl(index):
df = pd.DataFrame({'A': [1.0, 2.0, np.nan, 1.0], 'B': [4, 5, 6, 7], 'C': [1.0, 2.0, np.nan, 1.0]},
index=index)
df = pd.DataFrame({
'A': [1.0, 2.0, np.nan, 1.0],
'B': [4, 5, 6, 7],
'C': [1.0, 2.0, np.nan, 1.0]
}, index=index)
return df.drop(columns='A')

index_to_test = [[1, 2, 3, 4],
[.1, .2, .3, .4],
['a', 'b', 'c', 'd']]

sdc_func = self.jit(test_impl)

for index in index_to_test:
for index in [[1, 2, 3, 4], [.1, .2, .3, .4], ['a', 'b', 'c', 'd']]:
with self.subTest(index=index):
pd.testing.assert_frame_equal(sdc_func(index), test_impl(index))

Expand Down Expand Up @@ -1884,7 +1880,6 @@ def test_impl(df):
index=index)
pd.testing.assert_frame_equal(sdc_func(df), test_impl(df))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_drop_tuple_column(self):
def gen_test_impl(do_jit=False):
def test_impl(index):
Expand Down Expand Up @@ -2037,7 +2032,6 @@ def test_impl(df, arr):
sdc_func = self.jit(test_impl)
pd.testing.assert_frame_equal(sdc_func(df, arr), test_impl(df, arr))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_getitem_bool_array_even_idx_no_unboxing(self):
def test_impl(arr):
df = pd.DataFrame({
Expand Down Expand Up @@ -2139,7 +2133,6 @@ def test_impl(idx):
sdc_func = self.jit(test_impl)
pd.testing.assert_series_equal(sdc_func('A'), test_impl('A'))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_getitem_slice_idx_no_unboxing(self):
def test_impl():
df = pd.DataFrame({
Expand All @@ -2150,9 +2143,8 @@ def test_impl():
return df[1:3]

sdc_func = self.jit(test_impl)
pd.testing.assert_series_equal(sdc_func(), test_impl())
pd.testing.assert_frame_equal(sdc_func(), test_impl())

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_getitem_unbox_slice_idx_no_unboxing(self):
def test_impl(start, end):
df = pd.DataFrame({
Expand All @@ -2163,9 +2155,8 @@ def test_impl(start, end):
return df[start:end]

sdc_func = self.jit(test_impl)
pd.testing.assert_series_equal(sdc_func(1, 3), test_impl(1, 3))
pd.testing.assert_frame_equal(sdc_func(1, 3), test_impl(1, 3))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_getitem_tuple_idx_no_unboxing(self):
def gen_test_impl(do_jit=False):
def test_impl():
Expand All @@ -2183,9 +2174,8 @@ def test_impl():

test_impl = gen_test_impl()
sdc_func = self.jit(gen_test_impl(do_jit=True))
pd.testing.assert_series_equal(sdc_func(), test_impl())
pd.testing.assert_frame_equal(sdc_func(), test_impl())

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_df_getitem_bool_series_idx_no_unboxing(self):
def test_impl():
df = pd.DataFrame({
Expand All @@ -2196,7 +2186,7 @@ def test_impl():
return df[df['A'] == -1.]

sdc_func = self.jit(test_impl)
pd.testing.assert_series_equal(sdc_func(), test_impl())
pd.testing.assert_frame_equal(sdc_func(), test_impl())

@skip_sdc_jit('DF.getitem unsupported Series name')
@dfRefactoringNotImplemented
Expand Down Expand Up @@ -2295,7 +2285,6 @@ def test_impl(df, df2):
df2.A[n // 2:] = n
pd.testing.assert_frame_equal(sdc_func(df, df2), test_impl(df, df2))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_append_df_same_cols_no_index_no_unboxing(self):
def test_impl():
n = 11
Expand All @@ -2304,8 +2293,14 @@ def test_impl():
df2.A[n // 2:] = n
return df.append(df2, ignore_index=True)

sdc_func = self.jit(test_impl)
pd.testing.assert_frame_equal(sdc_func(), test_impl())
sdc_impl = self.jit(test_impl)

kwargs = {}
if platform.system() == 'Windows':
# The "dtype" attribute differs on Windows (int64 vs int32), so the dtype check is skipped
kwargs['check_dtype'] = False

pd.testing.assert_frame_equal(sdc_impl(), test_impl(), **kwargs)

@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
def test_append_df_same_cols_index_default(self):
Expand Down Expand Up @@ -2334,20 +2329,23 @@ def test_impl(df, df2):

pd.testing.assert_frame_equal(sdc_func(df, df2), test_impl(df, df2))

@dfRefactoringNotImplemented # required re-implementing DataFrame boxing
def test_append_df_diff_cols_index_ignore_false_no_unboxing(self):
def test_impl():
n1 = 11
n2 = n1 * 2
df = pd.DataFrame({'A': np.arange(n1), 'B': np.arange(n1) ** 2},
index=np.arange(n1) ** 4)
df2 = pd.DataFrame({'C': np.arange(n2), 'D': np.arange(n2) ** 2,
'E S D': np.arange(n2) + 100},
index=np.arange(n2) ** 8)
df = pd.DataFrame({
'A': np.arange(n1), 'B': np.arange(n1) ** 2
}, index=np.arange(n1) ** 2)
df2 = pd.DataFrame({
'C': np.arange(n2), 'D': np.arange(n2) ** 2,
'E S D': np.arange(n2) + 100
}, index=np.arange(n2) ** 4)
return df.append(df2, ignore_index=False)

sdc_func = self.jit(test_impl)
pd.testing.assert_frame_equal(sdc_func(), test_impl())
res_jit = sdc_func()
res_ref = test_impl()
pd.testing.assert_frame_equal(res_jit, res_ref)

@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
def test_append_df_diff_cols_index_ignore_index(self):
Expand Down
Loading