Skip to content

Commit 430b170

Browse files
committed
add datasets to README.md
1 parent d5ad7d0 commit 430b170

2 files changed

Lines changed: 66 additions & 12 deletions

File tree

README.md

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Bright Data Python SDK
22

3-
The official Python SDK for [Bright Data](https://brightdata.com) APIs. Scrape any website, get SERP results, bypass bot detection and CAPTCHAs.
3+
The official Python SDK for [Bright Data](https://brightdata.com) APIs. Scrape any website, get SERP results, bypass bot detection and CAPTCHAs, and access 100+ ready-made datasets.
44

55
[![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/)
66
[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
@@ -135,6 +135,55 @@ async with BrightDataClient() as client:
135135
- `client.scrape.instagram` - profiles, posts, comments, reels
136136
- `client.scrape.facebook` - posts, comments, reels
137137

138+
## Datasets API
139+
140+
Access 100+ ready-made datasets from Bright Data — pre-collected, structured data from popular platforms.
141+
142+
```python
143+
async with BrightDataClient() as client:
144+
# Filter a dataset — returns a snapshot_id
145+
snapshot_id = await client.datasets.imdb_movies(
146+
filter={"name": "title", "operator": "includes", "value": "black"},
147+
records_limit=5
148+
)
149+
150+
# Download when ready (polls until snapshot is complete)
151+
data = await client.datasets.imdb_movies.download(snapshot_id)
152+
print(f"Got {len(data)} records")
153+
154+
# Quick sample: .sample() auto-discovers fields, no filter needed
155+
# Works on any dataset
156+
snapshot_id = await client.datasets.imdb_movies.sample(records_limit=5)
157+
```
158+
159+
**Export results to file:**
160+
161+
```python
162+
from brightdata.datasets import export
163+
164+
export(data, "results.json") # JSON
165+
export(data, "results.csv") # CSV
166+
export(data, "results.jsonl") # JSONL
167+
```
168+
169+
**Available dataset categories:**
170+
- **E-commerce:** Amazon, Walmart, Shopee, Lazada, Zalando, Zara, H&M, Shein, IKEA, Sephora, and more
171+
- **Business intelligence:** ZoomInfo, PitchBook, Owler, Slintel, VentureRadar, Manta
172+
- **Jobs & HR:** Glassdoor (companies, reviews, jobs), Indeed (companies, jobs), Xing
173+
- **Reviews:** Google Maps, Yelp, G2, Trustpilot, TrustRadius
174+
- **Social media:** Pinterest (posts, profiles), Facebook Pages
175+
- **Real estate:** Zillow, Airbnb, and 8+ regional platforms
176+
- **Luxury brands:** Chanel, Dior, Prada, Balenciaga, Hermes, YSL, and more
177+
- **Entertainment:** IMDB, NBA, Goodreads
178+
179+
**Discover available fields:**
180+
181+
```python
182+
metadata = await client.datasets.imdb_movies.get_metadata()
183+
for name, field in metadata.fields.items():
184+
print(f"{name}: {field.type}")
185+
```
186+
138187
## Async Usage
139188

140189
Run multiple requests concurrently:

notebooks/datasets/mass_test.ipynb

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
},
4747
{
4848
"cell_type": "code",
49-
"execution_count": 2,
49+
"execution_count": null,
5050
"metadata": {},
5151
"outputs": [
5252
{
@@ -556,20 +556,25 @@
556556
},
557557
{
558558
"cell_type": "code",
559-
"execution_count": 11,
559+
"execution_count": 16,
560560
"metadata": {},
561561
"outputs": [
562562
{
563-
"ename": "TimeoutError",
564-
"evalue": "Snapshot snap_mlowuqjv283bi7hfob not ready after 300s (status: building)",
565-
"output_type": "error",
566-
"traceback": [
567-
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
568-
"\u001b[31mTimeoutError\u001b[39m Traceback (most recent call last)",
569-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Companies Enriched - Download\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m companies_enriched_data = \u001b[38;5;28;01mawait\u001b[39;00m client.datasets.companies_enriched.download(companies_enriched_snapshot)\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCompanies Enriched: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(companies_enriched_data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m records\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 4\u001b[39m companies_enriched_data\n",
570-
"\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/projects/sdk-python/src/brightdata/datasets/base.py:185\u001b[39m, in \u001b[36mBaseDataset.download\u001b[39m\u001b[34m(self, snapshot_id, format, timeout, poll_interval)\u001b[39m\n\u001b[32m 183\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DatasetError(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot failed: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.error\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 184\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m time.time() - start_time > timeout:\n\u001b[32m--> \u001b[39m\u001b[32m185\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\n\u001b[32m 186\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msnapshot_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not ready after \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[33ms \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 187\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m(status: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.status\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m)\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 188\u001b[39m )\n\u001b[32m 190\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio.sleep(poll_interval)\n\u001b[32m 192\u001b[39m \u001b[38;5;66;03m# Download data\u001b[39;00m\n",
571-
"\u001b[31mTimeoutError\u001b[39m: Snapshot snap_mlowuqjv283bi7hfob not ready after 300s (status: building)"
563+
"name": "stdout",
564+
"output_type": "stream",
565+
"text": [
566+
"Companies Enriched: 1 records\n"
572567
]
568+
},
569+
{
570+
"data": {
571+
"text/plain": [
572+
"[{'raw': 'Snapshot is building. Try again in a few minutes'}]"
573+
]
574+
},
575+
"execution_count": 16,
576+
"metadata": {},
577+
"output_type": "execute_result"
573578
}
574579
],
575580
"source": [

0 commit comments

Comments (0)