Skip to content
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions synapseclient/core/upload/upload_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ def get_partial_dataframe_chunk(
header=False,
index=False,
float_format="%.12g",
doublequote=False,
escapechar="\\",
quoting=0,
Comment thread
danlu1 marked this conversation as resolved.
Outdated
**(to_csv_kwargs or {}),
)
number_of_bytes_in_buffer = buffer.tell()
Expand Down
63 changes: 58 additions & 5 deletions synapseclient/models/mixins/table_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ def row_labels_from_rows(rows: List[Row]) -> List[Row]:
def convert_dtypes_to_json_serializable(df):
Comment thread
danlu1 marked this conversation as resolved.
Outdated
Comment thread
danlu1 marked this conversation as resolved.
Outdated
"""
Convert the dtypes of the int64 and float64 columns to object columns which are JSON serializable types.
Replace both Ellipsis and pandas NA within nested structures which are not JSON serializable types.
Also, convert the ROW_ID, ROW_VERSION, and ROW_ID.1 columns to int columns which are JSON serializable types.
Arguments:
df: The dataframe to convert the dtypes of.
Expand All @@ -163,16 +164,68 @@ def convert_dtypes_to_json_serializable(df):
"datetime_list_col": [[datetime(2021, 1, 1), datetime(2021, 1, 2), datetime(2021, 1, 3)], [datetime(2021, 1, 4), datetime(2021, 1, 5), datetime(2021, 1, 6)], None, [datetime(2021, 1, 7), datetime(2021, 1, 8), datetime(2021, 1, 9)]],
"entityid_list_col": [["syn123", "syn456", None], ["syn101", "syn102", "syn103"], None, ["syn104", "syn105", "syn106"]],
"userid_list_col": [["user1", "user2", "user3"], ["user4", "user5", None], None, ["user7", "user8", "user9"]],
"json_col_with_quotes": [
{
"id": 1,
"description": 'Text with "quotes" in the description field',
"references": []
},
{
"id": 2,
"description": 'Another description with "quoted text" here',
Comment thread
danlu1 marked this conversation as resolved.
Outdated
"references": ["ref1", "ref2"]
},
{
"id": 3,
"description": 'Description containing "multiple" quoted "words"',
"references": [...]
},
{
"id": 4,
"description": 'Description containing apostrophes sage\'s',
"references": [...]
}

],
}).convert_dtypes()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we still need to call .convert_dtypes() here? since convert_dtypes_to_json_serializable also calls convert_dtypes in one of the steps?

df = convert_dtypes_to_json_serializable(df)
print(df)
"""
import pandas as pd

for col in df.columns:
df[col] = (
df[col].replace({pd.NA: None}).astype(object)
) # this will convert the int64 and float64 columns to object columns
sample_values = df[col].dropna()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dropna() + len() check is probably unnecessary.
Since apply() works fine on empty series and the function handles nulls, we could just apply the function directly:

    for col in df.columns:
        df[col] = df[col].apply(_serialize_json_value)

I tried ^this and all the unit tests could pass.

if len(sample_values):

def _serialize_json_value(x):
if isinstance(x, (list, dict)):

def _reformat_special_values(obj):
if obj is ...:
return "..."
if obj is pd.NA:
return None
if isinstance(obj, dict):
return {
k: _reformat_special_values(v) for k, v in obj.items()
}
if isinstance(obj, list):
return [_reformat_special_values(item) for item in obj]
return obj

cleaned_x = _reformat_special_values(x)
return cleaned_x
# Handle standalone ellipsis
if x is ...:
return "..."
return x

df[col] = df[col].apply(lambda x: _serialize_json_value(x))
Comment thread
danlu1 marked this conversation as resolved.
Outdated

# restore the original values of the column especially for the int64 and float64 columns since apply function changes the dtype
df[col] = df[col].convert_dtypes()
df[col] = df[col].replace({pd.NA: None}).astype(object)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the .replace({pd.NA: None}) is doing two things:

  • Catching top-level pd.NA that _serialize_json_value missed
  • Converting any pd.NA reintroduced by convert_dtypes()

I think it might be cleaner to have _serialize_json_value handles top-level NaN like this:

def _serialize_json_value(x):
    if pd.isna(x):  # Handle top-level NA
        return None

Copy link
Copy Markdown
Contributor

@linglp linglp Mar 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As an example, the pd.NA here in the dataframe is handled by df[col].replace({pd.NA: None}) rather than _serialize_json_value.

df = pd.DataFrame({
    "name": ["Alice", pd.NA, "Charlie"],
    "age": [25, 30, pd.NA]
})
df = convert_dtypes_to_json_serializable(df)

I am not sure if this is intended.

Comment on lines +218 to +223
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this going to handle columns that are entirely NA/None correctly? The old code always ran replace({pd.NA: None}).astype(object) on every column, but now if sample_values is empty (all NA), the column gets skipped entirely — meaning pd.NA values stay as pd.NA instead of becoming None, and the dtype doesn't get converted to object. Could that cause issues downstream?

Note: This comment was drafted with AI assistance and reviewed by me for accuracy.


# Convert ROW_ prefixed columns back to int (like ROW_ID, ROW_VERSION)
if col in [
"ROW_ID",
Expand Down Expand Up @@ -2809,7 +2862,6 @@ async def main():
timeout=timeout,
synapse_client=synapse_client,
)

if download_location:
return csv_path

Expand Down Expand Up @@ -4031,8 +4083,9 @@ async def _chunk_and_upload_df(
to_csv_kwargs: Additional arguments to pass to the `pd.DataFrame.to_csv`
function when writing the data to a CSV file.
"""
# Serializes dict/list values to JSON strings
Comment thread
danlu1 marked this conversation as resolved.
Outdated
df = convert_dtypes_to_json_serializable(df)
Comment thread
danlu1 marked this conversation as resolved.
# Loop over the rows of the DF to determine the size/boundries we'll be uploading

chunks_to_upload = []
size_of_chunk = 0
buffer = BytesIO()
Expand Down
186 changes: 186 additions & 0 deletions tests/integration/synapseclient/models/async/test_table_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import pandas as pd
import pytest
from pandas.api.types import is_object_dtype
from pytest_mock import MockerFixture

import synapseclient.models.mixins.asynchronous_job as asynchronous_job_module
Expand Down Expand Up @@ -351,6 +352,8 @@ async def test_store_rows_from_csv_infer_columns(
"float_string": [1.1, 2.2, 3.3, None],
}
)
data_for_table = data_for_table.convert_dtypes()
data_for_table = data_for_table.replace({pd.NA: None})
filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv"
self.schedule_for_cleanup(filepath)
data_for_table.to_csv(filepath, index=False, float_format="%.12g")
Expand Down Expand Up @@ -512,6 +515,8 @@ async def test_store_rows_from_manually_defined_columns(
"float_column": [1.1, 2.2, 3.3, None],
}
)
data_for_table = data_for_table.convert_dtypes()
data_for_table = data_for_table.replace({pd.NA: None})
filepath = f"{tempfile.mkdtemp()}/upload_{uuid.uuid4()}.csv"
self.schedule_for_cleanup(filepath)
data_for_table.to_csv(filepath, index=False, float_format="%.12g")
Expand Down Expand Up @@ -977,6 +982,179 @@ async def test_store_rows_as_large_df_being_split_and_uploaded(
# AND The spy should have been called in multiple batches
assert spy_send_job.call_count == 1

async def test_store_rows_with_quotes_and_apostrophes_ellipses(
    self, project_model: Project
) -> None:
    """Test columns with quotes, apostrophes, and ellipses (in lists, dicts, and standalone) in values are properly stored and retrieved in the tables"""
    # GIVEN a table with a JSON column.
    # One column per serialization edge case: JSON payloads, a string
    # list, a plain string column, an int list that mixes real lists
    # with pd.NA rows, and nullable int/float scalar columns.
    table_name = str(uuid.uuid4())
    table = Table(
        name=table_name,
        parent_id=project_model.id,
        columns=[
            Column(name="id", column_type=ColumnType.INTEGER),
            Column(name="json_data", column_type=ColumnType.JSON),
            Column(
                name="string_list_with_ellipses", column_type=ColumnType.STRING_LIST
            ),
            Column(name="string_col_with_ellipses", column_type=ColumnType.STRING),
            Column(name="int_list_with_pa_na", column_type=ColumnType.INTEGER_LIST),
            Column(name="nullable_int", column_type=ColumnType.INTEGER),
            Column(name="nullable_float", column_type=ColumnType.DOUBLE),
        ],
    )
    table = await table.store_async(synapse_client=self.syn)
    self.schedule_for_cleanup(table.id)

    # AND data with quotes in JSON values
    data_for_table = pd.DataFrame(
        {
            "id": [1, 2, 3, 4, 5, 6, 7],
            "json_data": [
                {"description": 'Text with "quotes" here', "value": 100},
                {
                    "description": 'Multiple "quoted" "words" here',
                    "value": 300,
                },
                {
                    "description": ...,
                    "value": 200,
                },  # standalone ellipses in the json value
                {
                    "description": [1, 2, ...],
                    "value": 400,
                },  # list with ellipses in the json value
                {
                    "description": {"inner": ...},
                    "value": 500,
                },  # dict with ellipses in the json value
                {
                    "description": "single apostrophe's",
                    "author": "D'Angelo",
                },  # single apostrophe in the json value
                {
                    "description": "Multiple's apostrophe's",
                    "author": "McDonald's",
                },  # multiple apostrophe's in the json value
            ],
            "string_list_with_ellipses": [
                ["a", "b", ...],
                ["d", ..., "f"],
                ["g", "h", "i"],
                [...],
                ["m", "n", "..."],
                ["p", "q", "r"],
                ["s", "t", "u"],
            ],
            "string_col_with_ellipses": [
                "value1",
                ...,
                "value3",
                ...,
                "value6",
                ...,
                "value8",
            ],
            "int_list_with_pa_na": [
                [1, 2, 3],
                pd.NA,
                [7, 8, 9],
                pd.NA,
                [11, 12, 13],
                pd.NA,
                [15, 16, 17],
            ],
            "nullable_int": pd.array([10, pd.NA, 30, pd.NA, 31, pd.NA, 32]),
            "nullable_float": pd.array([1.1, pd.NA, 3.3, pd.NA, 3.4, pd.NA, 3.5]),
        }
    )
    # WHEN I store the rows
    await table.store_rows_async(
        values=data_for_table,
        synapse_client=self.syn,
    )
    # THEN I can query the table and retrieve the data correctly
    results = await query_async(
        f"SELECT * FROM {table.id}",
        synapse_client=self.syn,
        timeout=QUERY_TIMEOUT_SEC,
    )
    # AND the JSON data should be properly preserved with quotes
    assert len(results) == 7
    # Expected values after the round trip: Ellipsis values come back as
    # the literal string "...", pd.NA list cells come back as empty
    # lists, and scalar pd.NA becomes None.
    # NOTE(review): the []-for-NA expectation in int_list_with_pa_na
    # reflects observed service behavior — confirm it is intended.
    expected_result = pd.DataFrame(
        {
            "id": [1, 2, 3, 4, 5, 6, 7],
            "json_data": [
                {"description": 'Text with "quotes" here', "value": 100},
                {
                    "description": 'Multiple "quoted" "words" here',
                    "value": 300,
                },
                {
                    "description": "...",
                    "value": 200,
                },  # standalone ellipses in the json value
                {
                    "description": [1, 2, "..."],
                    "value": 400,
                },  # list with ellipses in the json value
                {
                    "description": {"inner": "..."},
                    "value": 500,
                },  # dict with ellipses in the json value
                {
                    "description": "single apostrophe's",
                    "author": "D'Angelo",
                },  # single apostrophe in the json value
                {
                    "description": "Multiple's apostrophe's",
                    "author": "McDonald's",
                },  # multiple apostrophe's in the json value
            ],
            "string_list_with_ellipses": [
                ["a", "b", "..."],
                ["d", "...", "f"],
                ["g", "h", "i"],
                ["..."],
                ["m", "n", "..."],
                ["p", "q", "r"],
                ["s", "t", "u"],
            ],
            "string_col_with_ellipses": [
                "value1",
                "...",
                "value3",
                "...",
                "value6",
                "...",
                "value8",
            ],
            "int_list_with_pa_na": [
                [1, 2, 3],
                [],
                [7, 8, 9],
                [],
                [11, 12, 13],
                [],
                [15, 16, 17],
            ],
            "nullable_int": pd.array([10, None, 30, None, 31, None, 32]),
            "nullable_float": pd.array([1.1, None, 3.3, None, 3.4, None, 3.5]),
        }
    )
    # Columns that carried NA/JSON/list values should come back as object dtype
    assert is_object_dtype(results.json_data)
    assert is_object_dtype(results.int_list_with_pa_na)
    assert is_object_dtype(results.nullable_int)
    assert is_object_dtype(results.nullable_float)

    # Normalize the expected frame the same way results are normalized
    # (convert_dtypes, then pd.NA -> None) before comparing.
    expected_result = expected_result.convert_dtypes()
    expected_result = expected_result.replace({pd.NA: None})
    pd.testing.assert_frame_equal(
        results.drop(columns=["ROW_ID", "ROW_VERSION"]),
        expected_result,
        check_dtype=False,
    )


class TestUpsertRows:
@pytest.fixture(autouse=True, scope="function")
Expand Down Expand Up @@ -1549,9 +1727,14 @@ async def test_upsert_all_data_types(self, project_model: Project) -> None:
],
}
)

expected_results = expected_results.convert_dtypes()
expected_results = expected_results.replace({pd.NA: None})
# import pdb; pdb.set_trace()
Comment thread
danlu1 marked this conversation as resolved.
Outdated
pd.testing.assert_frame_equal(
results_after_insert, expected_results, check_dtype=False
)

# Create a second test file to update references
path2 = utils.make_bogus_data_file()
self.schedule_for_cleanup(path2)
Expand Down Expand Up @@ -1733,7 +1916,10 @@ async def test_upsert_all_data_types(self, project_model: Project) -> None:
],
}
)
expected_results = expected_results.convert_dtypes()
expected_results = expected_results.replace({pd.NA: None})
pd.testing.assert_frame_equal(results, expected_results, check_dtype=False)

# WHEN I upsert with multiple primary keys and null values
multi_key_data = pd.DataFrame(
{
Expand Down
Loading
Loading