Skip to content
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions synapseclient/core/upload/upload_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ def get_partial_dataframe_chunk(
header=False,
index=False,
float_format="%.12g",
doublequote=False,
escapechar="\\",
quoting=0,
Comment thread
danlu1 marked this conversation as resolved.
Outdated
**(to_csv_kwargs or {}),
)
number_of_bytes_in_buffer = buffer.tell()
Expand Down
69 changes: 65 additions & 4 deletions synapseclient/models/mixins/table_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,8 @@ def row_labels_from_rows(rows: List[Row]) -> List[Row]:
def convert_dtypes_to_json_serializable(df):
Comment thread
danlu1 marked this conversation as resolved.
Outdated
Comment thread
danlu1 marked this conversation as resolved.
Outdated
"""
Convert the dtypes of the int64 and float64 columns to object columns which are JSON serializable types.
Convert the list and dict columns to JSON strings which are JSON serializable types.
Comment thread
danlu1 marked this conversation as resolved.
Outdated
Replace both Ellipsis and pandas NA within nested structures which are not JSON serializable types.
Also, convert the ROW_ID, ROW_VERSION, and ROW_ID.1 columns to int columns which are JSON serializable types.
Arguments:
df: The dataframe to convert the dtypes of.
Expand All @@ -163,16 +165,74 @@ def convert_dtypes_to_json_serializable(df):
"datetime_list_col": [[datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)], [datetime(2021, 1, 4), datetime(2021, 1, 5), datetime(2021, 1, 6)], None, [datetime(2021, 1, 7), datetime(2021, 1, 8), datetime(2021, 1, 9)]],
"entityid_list_col": [["syn123", "syn456", None], ["syn101", "syn102", "syn103"], None, ["syn104", "syn105", "syn106"]],
"userid_list_col": [["user1", "user2", "user3"], ["user4", "user5", None], None, ["user7", "user8", "user9"]],
"json_col_with_quotes": [
{
"id": 1,
"description": 'Text with "quotes" in the description field',
"references": []
},
{
"id": 2,
"description": 'Another description with "quoted text" here',
Comment thread
danlu1 marked this conversation as resolved.
Outdated
"references": ["ref1", "ref2"]
},
{
"id": 3,
"description": 'Description containing "multiple" quoted "words"',
"references": [...]
},
{
"id": 4,
"description": 'Description containing apostrophes sage\'s',
"references": [...]
}

],
}).convert_dtypes()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need to call .convert_dtypes() here, given that convert_dtypes_to_json_serializable also calls convert_dtypes in one of its steps?

df = convert_dtypes_to_json_serializable(df)
print(df)
"""
import pandas as pd

for col in df.columns:
df[col] = (
df[col].replace({pd.NA: None}).astype(object)
) # this will convert the int64 and float64 columns to object columns
if df[col].notna().any():
Comment thread
danlu1 marked this conversation as resolved.
Outdated
sample_values = df[col].dropna()
if len(sample_values):
Comment thread
danlu1 marked this conversation as resolved.
Outdated

def _serialize_json_value(x):
if x is None:
return None
if isinstance(x, (list, dict)):

def _reformat_special_values(obj):
if obj is ...:
return "..."
# Handle pandas NA - use an identity check (not ==) to avoid array ambiguity
if obj is pd.NA:
return None
if isinstance(obj, dict):
return {
k: _reformat_special_values(v)
for k, v in obj.items()
}
if isinstance(obj, list):
return [_reformat_special_values(item) for item in obj]
return obj

cleaned_x = _reformat_special_values(x)
# return json.dumps(cleaned_x, ensure_ascii=False)
Comment thread
danlu1 marked this conversation as resolved.
Outdated
return cleaned_x
# Handle standalone ellipsis
if x is ...:
return "..."
return x

df[col] = df[col].apply(lambda x: _serialize_json_value(x))

# restore the original values of the column especially for the int64 and float64 columns since apply function changes the dtype
df[col] = df[col].convert_dtypes()
df[col] = df[col].replace({pd.NA: None}).astype(object)

# Convert ROW_ prefixed columns back to int (like ROW_ID, ROW_VERSION)
if col in [
"ROW_ID",
Expand Down Expand Up @@ -4031,8 +4091,9 @@ async def _chunk_and_upload_df(
to_csv_kwargs: Additional arguments to pass to the `pd.DataFrame.to_csv`
function when writing the data to a CSV file.
"""
# Normalize dtypes and replace non-JSON-serializable values (Ellipsis, pandas NA) nested in dict/list cells
Comment thread
danlu1 marked this conversation as resolved.
Outdated
df = convert_dtypes_to_json_serializable(df)
Comment thread
danlu1 marked this conversation as resolved.
# Loop over the rows of the DF to determine the size/boundaries we'll be uploading

chunks_to_upload = []
size_of_chunk = 0
buffer = BytesIO()
Expand Down
Loading
Loading