Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 37 additions & 42 deletions sdc/datatypes/hpat_pandas_dataframe_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,29 +227,19 @@ def hpat_pandas_df_values_impl(self, numba_common_dtype):

def sdc_pandas_dataframe_append_codegen(df, other, _func_name, ignore_index_value, indexes_comparable, args):
"""
Input:
df = pd.DataFrame({'A': ['cat', 'dog', np.nan], 'B': [.2, .3, np.nan]})
other = pd.DataFrame({'A': ['bird', 'fox', 'mouse'], 'C': ['a', np.nan, '']})
ignore_index=True

Func generated:
Example of generated implementation:
def sdc_pandas_dataframe_append_impl(df, other, ignore_index=False, verify_integrity=False, sort=None):
len_df = len(df._data[0])
len_other = len(other._data[0])
new_col_A_data_df = df._data[0]
new_col_A_data_other = other._data[0]
new_col_A = init_series(new_col_A_data_df).append(init_series(new_col_A_data_other))._data
new_col_B_data_df = df._data[1]
new_col_B_data = init_series(new_col_B_data_df)._data
new_col_B = fill_array(new_col_B_data, len_df+len_other)
new_col_C_data_other = other._data[0]
new_col_C_data = init_series(new_col_C_data_other)._data
new_col_C = fill_str_array(new_col_C_data, len_df+len_other, push_back=False)
return pandas.DataFrame({"A": new_col_A, "B": new_col_B, "C": new_col_C)
len_df = len(df._data[0][0])
len_other = len(other._data[0][0])
new_col_0_data_df = df._data[0][0]
new_col_0_data_other = other._data[0][0]
new_col_0 = init_series(new_col_0_data_df).append(init_series(new_col_0_data_other))._data
new_col_1_data_df = df._data[0][1]
new_col_1_data_other = other._data[0][1]
new_col_1 = init_series(new_col_1_data_df).append(init_series(new_col_1_data_other))._data
return pandas.DataFrame({"A": new_col_0, "B": new_col_1})
"""
indent = 4 * ' '
func_args = ['df', 'other']

func_args = ['df', 'other'] + kwsparams2list(args)

df_columns_indx = {col_name: i for i, col_name in enumerate(df.columns)}
Expand All @@ -267,38 +257,43 @@ def sdc_pandas_dataframe_append_impl(df, other, ignore_index=False, verify_integ
func_text = []
column_list = []

func_text.append(f'len_df = len(df._data[0])')
func_text.append(f'len_other = len(other._data[0])')
func_text.append(f'len_df = len(df._data[0][0])')
func_text.append(f'len_other = len(other._data[0][0])')

for col_name, col_id in df_columns_indx.items():
func_text.append(f'new_col_{col_id}_data_{"df"} = {"df"}._data[{col_id}]')
for col_name, idx in df_columns_indx.items():
col_loc = df.column_loc[col_name]
type_id, col_id = col_loc.type_id, col_loc.col_id
func_text.append(f'new_col_{idx}_data_df = df._data[{type_id}][{col_id}]')
if col_name in other_columns_indx:
other_col_id = other_columns_indx.get(col_name)
func_text.append(f'new_col_{col_id}_data_{"other"} = '
f'{"other"}._data[{other_columns_indx.get(col_name)}]')
s1 = f'init_series(new_col_{col_id}_data_{"df"})'
s2 = f'init_series(new_col_{col_id}_data_{"other"})'
func_text.append(f'new_col_{col_id} = {s1}.append({s2})._data')
other_col_loc = other.column_loc[col_name]
other_type_id, other_col_id = other_col_loc.type_id, other_col_loc.col_id
func_text.append(f'new_col_{idx}_data_other = '
f'other._data[{other_type_id}][{other_col_id}]')
s1 = f'init_series(new_col_{idx}_data_df)'
s2 = f'init_series(new_col_{idx}_data_other)'
func_text.append(f'new_col_{idx} = {s1}.append({s2})._data')
else:
func_text.append(f'new_col_{col_id}_data = init_series(new_col_{col_id}_data_df)._data')
func_text.append(f'new_col_{idx}_data = init_series(new_col_{idx}_data_df)._data')
if col_name in string_type_columns:
func_text.append(f'new_col_{col_id} = fill_str_array(new_col_{col_id}_data, len_df+len_other)')
func_text.append(f'new_col_{idx} = fill_str_array(new_col_{idx}_data, len_df+len_other)')
else:
func_text.append(f'new_col_{col_id} = fill_array(new_col_{col_id}_data, len_df+len_other)')
column_list.append((f'new_col_{col_id}', col_name))
func_text.append(f'new_col_{idx} = fill_array(new_col_{idx}_data, len_df+len_other)')
column_list.append((f'new_col_{idx}', col_name))

for col_name, col_id in other_columns_indx.items():
for col_name, idx in other_columns_indx.items():
if col_name not in df_columns_indx:
func_text.append(f'new_col_{col_id}_data_{"other"} = {"other"}._data[{col_id}]')
func_text.append(f'new_col_{col_id}_data = init_series(new_col_{col_id}_data_other)._data')
other_col_loc = other.column_loc[col_name]
other_type_id, other_col_id = other_col_loc.type_id, other_col_loc.col_id
func_text.append(f'new_col_{idx}_data_other = other._data[{other_type_id}][{other_col_id}]')
func_text.append(f'new_col_{idx}_data = init_series(new_col_{idx}_data_other)._data')
if col_name in string_type_columns:
func_text.append(
f'new_col_{col_id}_other = '
f'fill_str_array(new_col_{col_id}_data, len_df+len_other, push_back=False)')
f'new_col_{idx}_other = '
f'fill_str_array(new_col_{idx}_data, len_df+len_other, push_back=False)')
else:
func_text.append(f'new_col_{col_id}_other = '
f'fill_array(new_col_{col_id}_data, len_df+len_other, push_back=False)')
column_list.append((f'new_col_{col_id}_other', col_name))
func_text.append(f'new_col_{idx}_other = '
f'fill_array(new_col_{idx}_data, len_df+len_other, push_back=False)')
column_list.append((f'new_col_{idx}_other', col_name))

data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list)

Expand Down
43 changes: 35 additions & 8 deletions sdc/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2057,7 +2057,7 @@ def test_impl(df):
df = pd.DataFrame({'A': np.arange(n), 'B': np.arange(n)**2})
pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))

@dfRefactoringNotImplemented
@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
def test_append_df_same_cols_no_index(self):
def test_impl(df, df2):
return df.append(df2, ignore_index=True)
Expand All @@ -2069,7 +2069,19 @@ def test_impl(df, df2):
df2.A[n // 2:] = n
pd.testing.assert_frame_equal(sdc_func(df, df2), test_impl(df, df2))

@dfRefactoringNotImplemented
@dfRefactoringNotImplemented  # required re-implementing DataFrame boxing
def test_append_df_same_cols_no_index_no_unboxing(self):
    """Append two frames with identical columns, ignoring the index.

    Both frames are created inside the tested function, so no DataFrame
    is passed in as an argument; only the returned frame crosses the
    boundary of the compiled function.
    """
    def test_impl():
        size = 11
        left = pd.DataFrame({'A': np.arange(size), 'B': np.arange(size)**2})
        right = pd.DataFrame({'A': np.arange(size), 'B': np.arange(size)**2})
        # Overwrite the second half of column A so the two frames differ.
        right.A[size // 2:] = size
        return left.append(right, ignore_index=True)

    sdc_func = self.jit(test_impl)
    # Compiled result first, reference result second (same call order as before).
    actual = sdc_func()
    expected = test_impl()
    pd.testing.assert_frame_equal(actual, expected)

@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
def test_append_df_same_cols_index_default(self):
def test_impl(df, df2):
return df.append(df2)
Expand All @@ -2082,7 +2094,7 @@ def test_impl(df, df2):

pd.testing.assert_frame_equal(sdc_func(df, df2), test_impl(df, df2))

@dfRefactoringNotImplemented
@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
def test_append_df_diff_cols_index_ignore_false(self):
def test_impl(df, df2):
return df.append(df2, ignore_index=False)
Expand All @@ -2096,7 +2108,22 @@ def test_impl(df, df2):

pd.testing.assert_frame_equal(sdc_func(df, df2), test_impl(df, df2))

@dfRefactoringNotImplemented
@dfRefactoringNotImplemented  # required re-implementing DataFrame boxing
def test_append_df_diff_cols_index_ignore_false_no_unboxing(self):
    """Append frames with disjoint columns and explicit indexes, keeping the index.

    Both frames are created inside the tested function, so no DataFrame
    is passed in as an argument; the second frame is twice as long as
    the first and shares no column names with it.
    """
    def test_impl():
        len_left = 11
        len_right = len_left * 2
        left = pd.DataFrame({'A': np.arange(len_left), 'B': np.arange(len_left) ** 2},
                            index=np.arange(len_left) ** 4)
        right = pd.DataFrame({'C': np.arange(len_right), 'D': np.arange(len_right) ** 2,
                              'E S D': np.arange(len_right) + 100},
                             index=np.arange(len_right) ** 8)
        return left.append(right, ignore_index=False)

    sdc_func = self.jit(test_impl)
    # Compiled result first, reference result second (same call order as before).
    actual = sdc_func()
    expected = test_impl()
    pd.testing.assert_frame_equal(actual, expected)

@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
def test_append_df_diff_cols_index_ignore_index(self):
def test_impl(df, df2):
return df.append(df2, ignore_index=True)
Expand All @@ -2110,7 +2137,7 @@ def test_impl(df, df2):

pd.testing.assert_frame_equal(sdc_func(df, df2), test_impl(df, df2))

@dfRefactoringNotImplemented
@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
def test_append_df_diff_cols_no_index(self):
def test_impl(df, df2):
return df.append(df2)
Expand All @@ -2123,7 +2150,7 @@ def test_impl(df, df2):

pd.testing.assert_frame_equal(sdc_func(df, df2), test_impl(df, df2))

@dfRefactoringNotImplemented
@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
def test_append_df_cross_cols_no_index(self):
def test_impl(df, df2):
return df.append(df2, ignore_index=True)
Expand All @@ -2136,7 +2163,7 @@ def test_impl(df, df2):

pd.testing.assert_frame_equal(sdc_func(df, df2), test_impl(df, df2))

@dfRefactoringNotImplemented
@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
def test_append_df_exception_incomparable_index_type(self):
def test_impl(df, df2):
return df.append(df2, ignore_index=False)
Expand All @@ -2157,7 +2184,7 @@ def test_impl(df, df2):
self.assertIn(msg, str(raises.exception))

@skip_sdc_jit
@dfRefactoringNotImplemented
@dfRefactoringNotImplemented # required re-implementing DataFrame unboxing
def test_append_df_diff_types_no_index(self):
def test_impl(df, df2):
return df.append(df2, ignore_index=True)
Expand Down