Fix bulk-upload tutorial: build CSV manifest with pandas

andrewelamb · andrewelamb · commit af434baa3793 · 2026-04-06T10:13:06.000-07:00
diff --git a/docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py b/docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py
@@ -5,31 +5,40 @@
 import os
 
 import synapseclient
-import synapseutils
 from synapseclient.models import Project
 
 syn = synapseclient.Synapse()
 syn.login()
 
 # Create some constants to store the paths to the data
 DIRECTORY_FOR_MY_PROJECT = os.path.expanduser(os.path.join("~", "my_ad_project"))
-PATH_TO_MANIFEST_FILE = os.path.expanduser(os.path.join("~", "manifest-for-upload.tsv"))
+PATH_TO_MANIFEST_FILE = os.path.expanduser(os.path.join("~", "manifest-for-upload.csv"))
 
 # Step 1: Let's find the synapse ID of our project:
 my_project_id = syn.findEntityId(
     name="My uniquely named project about Alzheimer's Disease"
 )
 
-# Step 2: Create a manifest TSV file to upload data in bulk
-# Note: When this command is run it will re-create your directory structure within
-# Synapse. Be aware of this before running this command.
-# If folders with the exact names already exists in Synapse, those folders will be used.
-synapseutils.generate_sync_manifest(
-    syn=syn,
-    directory_path=DIRECTORY_FOR_MY_PROJECT,
-    parent_id=my_project_id,
-    manifest_path=PATH_TO_MANIFEST_FILE,
-)
+# Step 2: Create a manifest CSV file to upload data in bulk
+# Walk the local directory tree and build a manifest with the required "path" and
+# "parentId" columns.  Folders that do not yet exist in Synapse are created
+# automatically by sync_to_synapse, so we set parentId to the project for every file.
+# NOTE: In a future release, Project.sync_from_synapse will support writing a manifest
+# CSV directly, removing the need to build one manually.
+import pandas as pd
+
+rows = []
+for dirpath, _dirnames, filenames in os.walk(DIRECTORY_FOR_MY_PROJECT):
+    for filename in filenames:
+        rows.append(
+            {
+                "path": os.path.join(dirpath, filename),
+                "parentId": my_project_id,
+            }
+        )
+
+df = pd.DataFrame(rows)
+df.to_csv(PATH_TO_MANIFEST_FILE, index=False)
 
 # Step 3: After generating the manifest file, we can upload the data in bulk
 project = Project(id=my_project_id)
@@ -39,21 +48,20 @@
 # Pandas is a powerful data manipulation library in Python, although it is not required
 # for this tutorial, it is used here to demonstrate how you can manipulate the manifest
 # file before uploading it to Synapse.
-import pandas as pd
 
-# Read TSV file into a pandas DataFrame
-df = pd.read_csv(PATH_TO_MANIFEST_FILE, sep="\t")
+# Read CSV file into a pandas DataFrame
+df = pd.read_csv(PATH_TO_MANIFEST_FILE)
 
 # Add a new column to the DataFrame
 df["species"] = "Homo sapiens"
 
 # Write the DataFrame back to the manifest file
-df.to_csv(PATH_TO_MANIFEST_FILE, sep="\t", index=False)
+df.to_csv(PATH_TO_MANIFEST_FILE, index=False)
 
 project.sync_to_synapse(manifest_path=PATH_TO_MANIFEST_FILE)
 
 # Step 5: Let's create an Activity/Provenance
-# First let's find the row in the TSV we want to update. This code finds the row number
+# First let's find the row in the CSV we want to update. This code finds the row number
 # that we would like to update.
 row_index = df[
     df["path"] == f"{DIRECTORY_FOR_MY_PROJECT}/biospecimen_experiment_1/fileA.txt"
@@ -77,6 +85,6 @@
 ] = "Experiment results created as a result of the linked data while running the pipeline."
 
 # Write the DataFrame back to the manifest file
-df.to_csv(PATH_TO_MANIFEST_FILE, sep="\t", index=False)
+df.to_csv(PATH_TO_MANIFEST_FILE, index=False)
 
 project.sync_to_synapse(manifest_path=PATH_TO_MANIFEST_FILE)
diff --git a/docs/tutorials/python/upload_data_in_bulk.md b/docs/tutorials/python/upload_data_in_bulk.md
@@ -1,4 +1,5 @@
 # Uploading data in bulk
+
 This tutorial will follow a
 [Flattened Data Layout](../../explanations/structuring_your_project.md#flattened-data-layout-example).
 With a project that has this example layout:
@@ -19,10 +20,11 @@ With a project that has this example layout:
 ```
 
 ## Tutorial Purpose
+
 In this tutorial you will:
 
 1. Find the synapse ID of your project
-1. Create a manifest TSV file to upload data in bulk
+1. Create a manifest CSV file to upload data in bulk
 1. Upload all of the files for our project
 1. Add an annotation to all of our files
 1. Add a provenance/activity record to one of our files
@@ -40,56 +42,59 @@ In this tutorial you will:
 
 
 ## Prerequisites
+
 * Make sure that you have completed the following tutorials:
     * [Project](./project.md)
 * This tutorial is setup to upload the data from `~/my_ad_project`, make sure that this or
 another desired directory exists.
 * Pandas is used in this tutorial. Refer to our
 [installation guide](../installation.md#pypi) to install it. Feel free to skip this
 portion of the tutorial if you do not wish to use Pandas. You may also use external
-tools to open and manipulate Tab Separated Value (TSV) files.
+tools to open and manipulate CSV files.
 
 
 ## 1. Find the synapse ID of your project
 
 First let's set up some constants we'll use in this script, and find the ID of our project
 ```python
-{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=5-21}
+{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=5-22}
 ```
 
-## 2. Create a manifest TSV file to upload data in bulk
+## 2. Create a manifest CSV file to upload data in bulk
 
-Let's "walk" our directory on disk to create a manifest file for upload
+Let's walk our local directory and build a CSV manifest with the required `path` and
+`parentId` columns. In a future release `Project.sync_from_synapse` will support
+writing a manifest CSV directly; for now we build one with pandas.
 ```python
-{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=23-33}
+{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=23-44}
 ```
 
 <details class="example">
-  <summary>After this has been run if you inspect the TSV file created you'll see it will look
+  <summary>After this has been run, if you inspect the CSV file created, you'll see it will look
 similar to this:</summary>
 ```
-path    parent
-/home/user_name/my_ad_project/single_cell_RNAseq_batch_2/SRR12345678_R2.fastq.gz  syn60109537
-/home/user_name/my_ad_project/single_cell_RNAseq_batch_2/SRR12345678_R1.fastq.gz  syn60109537
-/home/user_name/my_ad_project/biospecimen_experiment_2/fileD.txt  syn60109543
-/home/user_name/my_ad_project/biospecimen_experiment_2/fileC.txt  syn60109543
-/home/user_name/my_ad_project/single_cell_RNAseq_batch_1/SRR12345678_R2.fastq.gz  syn60109534
-/home/user_name/my_ad_project/single_cell_RNAseq_batch_1/SRR12345678_R1.fastq.gz  syn60109534
-/home/user_name/my_ad_project/biospecimen_experiment_1/fileA.txt  syn60109540
-/home/user_name/my_ad_project/biospecimen_experiment_1/fileB.txt  syn60109540
+path,parentId
+/home/user_name/my_ad_project/single_cell_RNAseq_batch_2/SRR12345678_R2.fastq.gz,syn60109500
+/home/user_name/my_ad_project/single_cell_RNAseq_batch_2/SRR12345678_R1.fastq.gz,syn60109500
+/home/user_name/my_ad_project/biospecimen_experiment_2/fileD.txt,syn60109500
+/home/user_name/my_ad_project/biospecimen_experiment_2/fileC.txt,syn60109500
+/home/user_name/my_ad_project/single_cell_RNAseq_batch_1/SRR12345678_R2.fastq.gz,syn60109500
+/home/user_name/my_ad_project/single_cell_RNAseq_batch_1/SRR12345678_R1.fastq.gz,syn60109500
+/home/user_name/my_ad_project/biospecimen_experiment_1/fileA.txt,syn60109500
+/home/user_name/my_ad_project/biospecimen_experiment_1/fileB.txt,syn60109500
 ```
 </details>
 
 ## 3. Upload the data in bulk
 ```python
-{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=35-37}
+{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=46-48}
 ```
 
 
 <details class="example">
   <summary>While this is running you'll see output in your console similar to:</summary>
 ```
-Validating manifest: /home/user_name/manifest-for-upload.tsv
+Validating manifest: /home/user_name/manifest-for-upload.csv
 Validating that all paths exist...
 Validating that all files are unique...
 Validating that all the files are not empty...
@@ -103,12 +108,12 @@ Uploading 8 files: 100%|██████████████████
 
 
 ## 4. Add an annotation to our manifest file
-At this point in the tutorial we will start to use pandas to manipulate a TSV file. If
+At this point in the tutorial we will use pandas to manipulate the CSV manifest. If
 you are not comfortable with pandas you may use any tool that can open and manipulate
-TSV such as excel or google sheets.
+CSV files such as Excel or Google Sheets.
 
 ```python
-{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=39-55}
+{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=50-63}
 ```
 
 Now that you have uploaded and annotated your files you'll be able to inspect your data
@@ -123,14 +128,14 @@ Let's create an [Activity/Provenance](../../explanations/domain_models_of_synaps
 record for one of our files. In otherwords, we will record the steps taken to generate
 the file.
 
-In this code we are finding a row in our TSV file and pointing to the file path of
+In this code we are finding a row in our CSV file and pointing to the file path of
 another file within our manifest. By doing this we are creating a relationship between
 the two files. This is a simple example of how you can create a provenance record in
 Synapse. Additionally we'll link off to a sample URL that describes a process that we
 may have executed to generate the file.
 
 ```python
-{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=57-83}
+{!docs/tutorials/python/tutorial_scripts/upload_data_in_bulk.py!lines=68-92}
 ```
 
 After running this code we may again inspect the synapse web UI. In this screenshot i've
@@ -157,7 +162,6 @@ navigated to the Files tab and selected the file that we added a Provenance reco
 
 - [syn.login][synapseclient.Synapse.login]
 - [syn.findEntityId][synapseclient.Synapse.findEntityId]
-- [synapseutils.generate_sync_manifest][]
 - [Project.sync_to_synapse][synapseclient.models.mixins.StorableContainer.sync_to_synapse]
-- [synapseutils.syncToSynapse][] *(deprecated)*
+- [Manifest CSV format](../../explanations/manifest_csv.md)
 - [Activity/Provenance](../../explanations/domain_models_of_synapse.md#activityprovenance)