Skip to content

Commit 430b170

Browse files
committed
add datasets to README.md
1 parent d5ad7d0 commit 430b170

2 files changed

Lines changed: 66 additions & 12 deletions

File tree

README.md

Lines changed: 50 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Bright Data Python SDK
22

3-
The official Python SDK for [Bright Data](https://brightdata.com) APIs. Scrape any website, get SERP results, bypass bot detection and CAPTCHAs.
3+
The official Python SDK for [Bright Data](https://brightdata.com) APIs. Scrape any website, get SERP results, bypass bot detection and CAPTCHAs, and access 100+ ready-made datasets.
44

55
[![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/)
66
[![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
@@ -135,6 +135,55 @@ async with BrightDataClient() as client:
135135
- `client.scrape.instagram` - profiles, posts, comments, reels
136136
- `client.scrape.facebook` - posts, comments, reels
137137

138+
## Datasets API
139+
140+
Access 100+ ready-made datasets from Bright Data — pre-collected, structured data from popular platforms.
141+
142+
```python
143+
async with BrightDataClient() as client:
144+
# Filter a dataset — returns a snapshot_id
145+
snapshot_id = await client.datasets.imdb_movies(
146+
filter={"name": "title", "operator": "includes", "value": "black"},
147+
records_limit=5
148+
)
149+
150+
# Download when ready (polls until snapshot is complete)
151+
data = await client.datasets.imdb_movies.download(snapshot_id)
152+
print(f"Got {len(data)} records")
153+
154+
# Quick sample: .sample() auto-discovers fields, no filter needed
155+
# Works on any dataset
156+
snapshot_id = await client.datasets.imdb_movies.sample(records_limit=5)
157+
```
158+
159+
**Export results to file:**
160+
161+
```python
162+
from brightdata.datasets import export
163+
164+
export(data, "results.json") # JSON
165+
export(data, "results.csv") # CSV
166+
export(data, "results.jsonl") # JSONL
167+
```
168+
169+
**Available dataset categories:**
170+
- **E-commerce:** Amazon, Walmart, Shopee, Lazada, Zalando, Zara, H&M, Shein, IKEA, Sephora, and more
171+
- **Business intelligence:** ZoomInfo, PitchBook, Owler, Slintel, VentureRadar, Manta
172+
- **Jobs & HR:** Glassdoor (companies, reviews, jobs), Indeed (companies, jobs), Xing
173+
- **Reviews:** Google Maps, Yelp, G2, Trustpilot, TrustRadius
174+
- **Social media:** Pinterest (posts, profiles), Facebook Pages
175+
- **Real estate:** Zillow, Airbnb, and 8+ regional platforms
176+
- **Luxury brands:** Chanel, Dior, Prada, Balenciaga, Hermes, YSL, and more
177+
- **Entertainment:** IMDB, NBA, Goodreads
178+
179+
**Discover available fields:**
180+
181+
```python
182+
metadata = await client.datasets.imdb_movies.get_metadata()
183+
for name, field in metadata.fields.items():
184+
print(f"{name}: {field.type}")
185+
```
186+
138187
## Async Usage
139188

140189
Run multiple requests concurrently:

notebooks/datasets/mass_test.ipynb

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
},
4747
{
4848
"cell_type": "code",
49-
"execution_count": 2,
49+
"execution_count": null,
5050
"metadata": {},
5151
"outputs": [
5252
{
@@ -556,20 +556,25 @@
556556
},
557557
{
558558
"cell_type": "code",
559-
"execution_count": 11,
559+
"execution_count": 16,
560560
"metadata": {},
561561
"outputs": [
562562
{
563-
"ename": "TimeoutError",
564-
"evalue": "Snapshot snap_mlowuqjv283bi7hfob not ready after 300s (status: building)",
565-
"output_type": "error",
566-
"traceback": [
567-
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
568-
"\u001b[31mTimeoutError\u001b[39m Traceback (most recent call last)",
569-
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Companies Enriched - Download\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m companies_enriched_data = \u001b[38;5;28;01mawait\u001b[39;00m client.datasets.companies_enriched.download(companies_enriched_snapshot)\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCompanies Enriched: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(companies_enriched_data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m records\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 4\u001b[39m companies_enriched_data\n",
570-
"\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/projects/sdk-python/src/brightdata/datasets/base.py:185\u001b[39m, in \u001b[36mBaseDataset.download\u001b[39m\u001b[34m(self, snapshot_id, format, timeout, poll_interval)\u001b[39m\n\u001b[32m 183\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DatasetError(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot failed: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.error\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 184\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m time.time() - start_time > timeout:\n\u001b[32m--> \u001b[39m\u001b[32m185\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\n\u001b[32m 186\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msnapshot_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not ready after \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[33ms \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 187\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m(status: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.status\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m)\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 188\u001b[39m )\n\u001b[32m 190\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio.sleep(poll_interval)\n\u001b[32m 192\u001b[39m \u001b[38;5;66;03m# Download data\u001b[39;00m\n",
571-
"\u001b[31mTimeoutError\u001b[39m: Snapshot snap_mlowuqjv283bi7hfob not ready after 300s (status: building)"
563+
"name": "stdout",
564+
"output_type": "stream",
565+
"text": [
566+
"Companies Enriched: 1 records\n"
572567
]
568+
},
569+
{
570+
"data": {
571+
"text/plain": [
572+
"[{'raw': 'Snapshot is building. Try again in a few minutes'}]"
573+
]
574+
},
575+
"execution_count": 16,
576+
"metadata": {},
577+
"output_type": "execute_result"
573578
}
574579
],
575580
"source": [

0 commit comments

Comments (0)