@@ -195,6 +195,38 @@ def map_values(
195195USE_INDEX = True # set here for the encoders
196196INDEX_COL = 0 if USE_INDEX else None # ... and this will be used for the decoders
197197
198+ import io
199+ import pandas as pd
200+
201+
202+ def single_column_parquet_encode (sequences , col = 0 ):
203+ """
204+ Encode a list of sequences into a single-column parquet file.
205+
206+ >>> sequences_1 = [[1, 2], [3, 4, 5]]
207+ >>> encoded_1 = single_column_parquet_encode(sequences_1)
208+ >>> decoded_1 = single_column_parquet_decode(encoded_1)
209+ >>> all((x == y).all() for x, y in zip(decoded_1, sequences_1))
210+ True
211+
212+ """
213+ return pd .DataFrame ({col : sequences }).to_parquet ()
214+
215+
216+ def single_column_parquet_decode (b : bytes , col = 0 ):
217+ """
218+ Decode a single-column parquet file into a list of sequences.
219+
220+ >>> sequences_2 = [['one', 'two'], ['three', 'four', 'five']]
221+ >>> encoded_2 = single_column_parquet_encode(sequences_2)
222+ >>> decoded_2 = single_column_parquet_decode(encoded_2)
223+ >>> all((x == y).all() for x, y in zip(decoded_2, sequences_2))
224+ True
225+
226+ """
227+ return pd .read_parquet (io .BytesIO (b ))[col ].values
228+
229+
198230_extension_to_encoder = split_keys (
199231 {
200232 # csv files
@@ -217,15 +249,9 @@ def map_values(
217249 "p pickle pkl" : pd .DataFrame .to_pickle ,
218250 # numpy arrays
219251 "npy" : LiteralVal (written_bytes (np .save , obj_arg_position_in_writer = 1 )),
220- # parquet format
221- "parquet" : LiteralVal (
222- written_bytes (pd .DataFrame .to_parquet , obj_arg_position_in_writer = 0 )
223- ),
224252 # zip-compressed tsv (custom implementation)
225253 "zip" : LiteralVal (save_df_to_zipped_tsv ),
226254 # hdf5 format (Hierarchical Data Format)
227- "feather" : pd .DataFrame .to_feather ,
228- # hdf5 format (Hierarchical Data Format)
229255 "h5 hdf5" : pd .DataFrame .to_hdf ,
230256 # stata files
231257 "stata dta" : partial (pd .DataFrame .to_stata , write_index = USE_INDEX ),
@@ -243,6 +269,7 @@ def map_values(
243269 # parquet format
244270 "parquet" : pd .DataFrame .to_parquet , # Need: pip install pyarrow, fastparquet
245271 # feather format
272+ "single_column_parquet" : single_column_parquet_encode ,
246273 "feather" : pd .DataFrame .to_feather , # Need: pip install pyarrow
247274 # orc format
248275 "orc" : pd .DataFrame .to_orc , # Need: pip install pyarrow
@@ -266,6 +293,7 @@ def map_values(
266293 "tsv" : partial (pd .read_csv , sep = "\t " , index_col = INDEX_COL ),
267294 # parquet format
268295 "parquet" : pd .read_parquet ,
296+ "single_column_parquet" : single_column_parquet_decode ,
269297 # json format
270298 "json" : partial (pd .read_json , orient = "records" ),
271299 # html tables
0 commit comments