Commit f5e4e95

feat: add single-column parquet encode/decode functions and improve table source handling
1 parent 66a8400

2 files changed: 41 additions & 7 deletions

tabled/base.py

Lines changed: 7 additions & 1 deletion
@@ -102,6 +102,7 @@ def get_table(
     Get a table from a variety of sources.
     """
     # If table_src is None, the user is trying to fix the parameters of the function
+
     if table_src is None:
         return partial(
             get_table,
@@ -115,14 +116,19 @@ def get_table(
     if isinstance(table_src, pd.DataFrame):
         return table_src

-    if ext is None and isinstance(table_src, str):
+
+    if ext is None:
+        ext = ''
+
+    if not ext and isinstance(table_src, str):
         key = table_src
         ext = file_extension(key)
         if ext == "zip":
             # compute the next extension
             next_ext = file_extension(key[: -len(".zip")])
             if next_ext:
                 ext = f"{next_ext}.{ext}"  # e.g. 'csv.zip'
+

     # TODO: Here's a great waste, since many of our table reading functions can
     # take file-like (paths, io objects) as input. Should make wrappers.py so that
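
Net effect in get_table: ext is normalized to an empty string when unset, and the extension is then inferred from a string source, including the 'csv.zip' style double extension. Below is a minimal standalone sketch of that inference; infer_ext and file_ext are hypothetical stand-ins for the logic above (the real code uses file_extension inside get_table):

def file_ext(name):
    """Stand-in for file_extension: text after the last dot, or ''."""
    return name.rsplit(".", 1)[-1] if "." in name else ""


def infer_ext(key, ext=None):
    # Mirror the new get_table behavior: treat None as '', then infer from the key.
    if ext is None:
        ext = ""
    if not ext:
        ext = file_ext(key)
        if ext == "zip":
            next_ext = file_ext(key[: -len(".zip")])  # extension before '.zip'
            if next_ext:
                ext = f"{next_ext}.{ext}"  # e.g. 'csv.zip'
    return ext


assert infer_ext("table.csv.zip") == "csv.zip"
assert infer_ext("table.parquet") == "parquet"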

tabled/wrappers.py

Lines changed: 34 additions & 6 deletions
@@ -195,6 +195,38 @@ def map_values(
 USE_INDEX = True  # set here for the encoders
 INDEX_COL = 0 if USE_INDEX else None  # ... and this will be used for the decoders

+import io
+import pandas as pd
+
+
+def single_column_parquet_encode(sequences, col=0):
+    """
+    Encode a list of sequences into a single-column parquet file.
+
+    >>> sequences_1 = [[1, 2], [3, 4, 5]]
+    >>> encoded_1 = single_column_parquet_encode(sequences_1)
+    >>> decoded_1 = single_column_parquet_decode(encoded_1)
+    >>> all((x == y).all() for x, y in zip(decoded_1, sequences_1))
+    True
+
+    """
+    return pd.DataFrame({col: sequences}).to_parquet()
+
+
+def single_column_parquet_decode(b: bytes, col=0):
+    """
+    Decode a single-column parquet file into a list of sequences.
+
+    >>> sequences_2 = [['one', 'two'], ['three', 'four', 'five']]
+    >>> encoded_2 = single_column_parquet_encode(sequences_2)
+    >>> decoded_2 = single_column_parquet_decode(encoded_2)
+    >>> all((x == y).all() for x, y in zip(decoded_2, sequences_2))
+    True
+
+    """
+    return pd.read_parquet(io.BytesIO(b))[col].values
+
+
 _extension_to_encoder = split_keys(
     {
         # csv files
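
The doctests in the hunk above already exercise the round trip. For completeness, here is how it might look from calling code; this is a sketch that assumes the module imports as tabled.wrappers and that a parquet engine (e.g. pyarrow) is installed, as the comments further down note:

from tabled.wrappers import single_column_parquet_encode, single_column_parquet_decode

ragged = [[1, 2], [3, 4, 5]]                 # rows of differing lengths
blob = single_column_parquet_encode(ragged)  # bytes of a one-column parquet table
back = single_column_parquet_decode(blob)    # array of per-row sequences

assert all((x == y).all() for x, y in zip(back, ragged))
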
@@ -217,15 +249,9 @@ def map_values(
         "p pickle pkl": pd.DataFrame.to_pickle,
         # numpy arrays
         "npy": LiteralVal(written_bytes(np.save, obj_arg_position_in_writer=1)),
-        # parquet format
-        "parquet": LiteralVal(
-            written_bytes(pd.DataFrame.to_parquet, obj_arg_position_in_writer=0)
-        ),
         # zip-compressed tsv (custom implementation)
         "zip": LiteralVal(save_df_to_zipped_tsv),
         # feather format
-        "feather": pd.DataFrame.to_feather,
-        # hdf5 format (Hierarchical Data Format)
         "h5 hdf5": pd.DataFrame.to_hdf,
         # stata files
         "stata dta": partial(pd.DataFrame.to_stata, write_index=USE_INDEX),
@@ -243,6 +269,7 @@ def map_values(
         # parquet format
         "parquet": pd.DataFrame.to_parquet,  # Need: pip install pyarrow, fastparquet
         # feather format
+        "single_column_parquet": single_column_parquet_encode,
         "feather": pd.DataFrame.to_feather,  # Need: pip install pyarrow
         # orc format
         "orc": pd.DataFrame.to_orc,  # Need: pip install pyarrow
@@ -266,6 +293,7 @@ def map_values(
         "tsv": partial(pd.read_csv, sep="\t", index_col=INDEX_COL),
         # parquet format
         "parquet": pd.read_parquet,
+        "single_column_parquet": single_column_parquet_decode,
         # json format
         "json": partial(pd.read_json, orient="records"),
         # html tables