Skip to content

Commit 77318e3

Browse files
committed
wip
1 parent 745e568 commit 77318e3

1 file changed

Lines changed: 72 additions & 242 deletions

File tree

docs/example-notebooks/big_file_blog/part2_blog.marimo.py

Lines changed: 72 additions & 242 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,18 @@ def _():
1111
return (pl,)
1212

1313

14+
@app.cell
15+
def _(mo):
16+
mo.md(
17+
r"""
18+
# Dealing with entire large dataframes
19+
20+
This blog walks through some techniques for dealing with a large dataframe when you need to look at the entire thing.
21+
"""
22+
)
23+
return
24+
25+
1426
@app.cell
1527
def _(mo):
1628
mo.md(
@@ -25,6 +37,11 @@ def _(mo):
2537

2638
@app.cell
2739
def _(pl):
40+
def get_lazy_df(fname):
41+
if fname.endswith("csv"):
42+
return pl.scan_csv(fname)
43+
else:
44+
return pl.scan_parquet(fname)
2845

2946
def get_categoricals(df, n_vals=250):
3047
cat_columns = []
@@ -34,24 +51,25 @@ def get_categoricals(df, n_vals=250):
3451
return cat_columns
3552

3653
def scan_vc(fname, out_fname, n_rows=500_000):
37-
small_df = pl.read_parquet(fname, n_rows=n_rows)
54+
small_df = get_lazy_df(fname)[:n_rows].collect()
55+
#small_df = pl.read_parquet(fname, n_rows=n_rows)
3856
cat_columns = get_categoricals(small_df)
3957
print("finished get_categoricals this many columns", len(cat_columns))
4058
select_args = []
4159
for k in small_df.columns:
4260
if k in cat_columns:
4361
select_args.append(pl.col(k).value_counts(sort=True).implode())
44-
lazy_df = pl.scan_csv(fname) if fname.endswith("csv") else pl.scan_parquet(fname)
45-
pl.scan_parquet(fname).select(select_args).sink_parquet(out_fname)
62+
#lazy_df = pl.scan_csv(fname) if fname.endswith("csv") else pl.scan_parquet(fname)
63+
get_lazy_df(fname).select(select_args).sink_parquet(out_fname)
4664
#scan_vc("~/JULY_FULL.parq", "~/JULY_FULL_vc2.parq")
4765
#this took 57 seconds on my MBA M1
4866
return get_categoricals, scan_vc
4967

5068

5169
@app.cell
52-
def _(scan_vc):
53-
scan_vc("~/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv", "~/JULY_FULL_vc2.parq")
54-
70+
def _():
71+
#scan_vc("~/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv", "~/JULY_FULL_vc2.parq")
72+
#1m24s on my laptop
5573
return
5674

5775

@@ -109,247 +127,59 @@ def _():
109127

110128

111129
@app.cell
112-
def _(convert_to_enum, pl, scan_vc):
113-
def long_running_function():
114-
scan_vc("~/JULY_FULL.parq", "~/JULY_FULL_vc.parq")
115-
_vc_df = pl.read_parquet("~/JULY_FULL_vc.parq")
116-
convert_to_enum("~/JULY_FULL.parq", "~/JULY_FULL_enum.parq", _vc_df)
130+
def _(pl):
131+
def convert_to_enum3(fname, out_fname, vc_df):
132+
enum_select = []
133+
for col in vc_df.columns:
134+
enum_select.append(pl.col(col))
135+
pl.scan_parquet(fname).select(enum_select).sink_parquet(out_fname)
136+
#22 seconds on my machine
137+
#convert_to_enum3("~/JULY_FULL.parq", "~/JULY_FULL_enum3.parq", vc_df)
138+
return
139+
140+
141+
@app.cell
142+
def _(pl):
143+
_ROWS=5
144+
_COLS=5
145+
_df = pl.read_parquet("~/JULY_FULL_enum2.parq", n_rows=_ROWS)
146+
_df[_df.columns[:_COLS]]
147+
return
148+
149+
150+
@app.cell
151+
def _(pl):
152+
pl.read_parquet("~/JULY_FULL_enum3.parq", n_rows=5000)
117153
return
118154

119155

120156
@app.cell
121157
def _():
122-
cat_cols = [
123-
"Entity Type Code",
124-
"Replacement NPI",
125-
"Employer Identification Number (EIN)",
126-
"Provider Name Prefix Text",
127-
"Provider Name Suffix Text",
128-
"Provider Other Organization Name",
129-
"Provider Other Organization Name Type Code",
130-
"Provider Other Name Prefix Text",
131-
"Provider Other Name Suffix Text",
132-
"Provider Other Last Name Type Code",
133-
"Provider Business Mailing Address State Name",
134-
"Provider Business Mailing Address Country Code (If outside U.S.)",
135-
"Provider Business Practice Location Address State Name",
136-
"Provider Business Practice Location Address Country Code (If outside U.S.)",
137-
"NPI Deactivation Reason Code",
138-
"Provider Sex Code",
139-
"Provider License Number State Code_1",
140-
"Healthcare Provider Primary Taxonomy Switch_1",
141-
"Provider License Number State Code_2",
142-
"Healthcare Provider Primary Taxonomy Switch_2",
143-
"Provider License Number State Code_3",
144-
"Healthcare Provider Primary Taxonomy Switch_3",
145-
"Provider License Number State Code_4",
146-
"Healthcare Provider Primary Taxonomy Switch_4",
147-
"Provider License Number State Code_5",
148-
"Healthcare Provider Primary Taxonomy Switch_5",
149-
"Provider License Number State Code_6",
150-
"Healthcare Provider Primary Taxonomy Switch_6",
151-
"Provider License Number State Code_7",
152-
"Healthcare Provider Primary Taxonomy Switch_7",
153-
"Provider License Number State Code_8",
154-
"Healthcare Provider Primary Taxonomy Switch_8",
155-
"Provider License Number State Code_9",
156-
"Healthcare Provider Primary Taxonomy Switch_9",
157-
"Provider License Number State Code_10",
158-
"Healthcare Provider Primary Taxonomy Switch_10",
159-
"Provider License Number State Code_11",
160-
"Healthcare Provider Primary Taxonomy Switch_11",
161-
"Provider License Number State Code_12",
162-
"Healthcare Provider Primary Taxonomy Switch_12",
163-
"Healthcare Provider Taxonomy Code_13",
164-
"Provider License Number State Code_13",
165-
"Healthcare Provider Primary Taxonomy Switch_13",
166-
"Healthcare Provider Taxonomy Code_14",
167-
"Provider License Number State Code_14",
168-
"Healthcare Provider Primary Taxonomy Switch_14",
169-
"Healthcare Provider Taxonomy Code_15",
170-
"Provider License Number_15",
171-
"Provider License Number State Code_15",
172-
"Healthcare Provider Primary Taxonomy Switch_15",
173-
"Other Provider Identifier Type Code_1",
174-
"Other Provider Identifier State_1",
175-
"Other Provider Identifier Type Code_2",
176-
"Other Provider Identifier State_2",
177-
"Other Provider Identifier Type Code_3",
178-
"Other Provider Identifier State_3",
179-
"Other Provider Identifier Type Code_4",
180-
"Other Provider Identifier State_4",
181-
"Other Provider Identifier Type Code_5",
182-
"Other Provider Identifier State_5",
183-
"Other Provider Identifier Type Code_6",
184-
"Other Provider Identifier State_6",
185-
"Other Provider Identifier Type Code_7",
186-
"Other Provider Identifier State_7",
187-
"Other Provider Identifier Type Code_8",
188-
"Other Provider Identifier State_8",
189-
"Other Provider Identifier Type Code_9",
190-
"Other Provider Identifier State_9",
191-
"Other Provider Identifier Type Code_10",
192-
"Other Provider Identifier State_10",
193-
"Other Provider Identifier Type Code_11",
194-
"Other Provider Identifier State_11",
195-
"Other Provider Identifier Type Code_12",
196-
"Other Provider Identifier State_12",
197-
"Other Provider Identifier Type Code_13",
198-
"Other Provider Identifier State_13",
199-
"Other Provider Identifier Type Code_14",
200-
"Other Provider Identifier State_14",
201-
"Other Provider Identifier Type Code_15",
202-
"Other Provider Identifier State_15",
203-
"Other Provider Identifier Type Code_16",
204-
"Other Provider Identifier State_16",
205-
"Other Provider Identifier Type Code_17",
206-
"Other Provider Identifier State_17",
207-
"Other Provider Identifier Type Code_18",
208-
"Other Provider Identifier State_18",
209-
"Other Provider Identifier Type Code_19",
210-
"Other Provider Identifier State_19",
211-
"Other Provider Identifier Type Code_20",
212-
"Other Provider Identifier State_20",
213-
"Other Provider Identifier Issuer_20",
214-
"Other Provider Identifier Type Code_21",
215-
"Other Provider Identifier State_21",
216-
"Other Provider Identifier Issuer_21",
217-
"Other Provider Identifier Type Code_22",
218-
"Other Provider Identifier State_22",
219-
"Other Provider Identifier Issuer_22",
220-
"Other Provider Identifier_23",
221-
"Other Provider Identifier Type Code_23",
222-
"Other Provider Identifier State_23",
223-
"Other Provider Identifier Issuer_23",
224-
"Other Provider Identifier_24",
225-
"Other Provider Identifier Type Code_24",
226-
"Other Provider Identifier State_24",
227-
"Other Provider Identifier Issuer_24",
228-
"Other Provider Identifier_25",
229-
"Other Provider Identifier Type Code_25",
230-
"Other Provider Identifier State_25",
231-
"Other Provider Identifier Issuer_25",
232-
"Other Provider Identifier_26",
233-
"Other Provider Identifier Type Code_26",
234-
"Other Provider Identifier State_26",
235-
"Other Provider Identifier Issuer_26",
236-
"Other Provider Identifier_27",
237-
"Other Provider Identifier Type Code_27",
238-
"Other Provider Identifier State_27",
239-
"Other Provider Identifier Issuer_27",
240-
"Other Provider Identifier_28",
241-
"Other Provider Identifier Type Code_28",
242-
"Other Provider Identifier State_28",
243-
"Other Provider Identifier Issuer_28",
244-
"Other Provider Identifier_29",
245-
"Other Provider Identifier Type Code_29",
246-
"Other Provider Identifier State_29",
247-
"Other Provider Identifier Issuer_29",
248-
"Other Provider Identifier_30",
249-
"Other Provider Identifier Type Code_30",
250-
"Other Provider Identifier State_30",
251-
"Other Provider Identifier Issuer_30",
252-
"Other Provider Identifier_31",
253-
"Other Provider Identifier Type Code_31",
254-
"Other Provider Identifier State_31",
255-
"Other Provider Identifier Issuer_31",
256-
"Other Provider Identifier_32",
257-
"Other Provider Identifier Type Code_32",
258-
"Other Provider Identifier State_32",
259-
"Other Provider Identifier Issuer_32",
260-
"Other Provider Identifier_33",
261-
"Other Provider Identifier Type Code_33",
262-
"Other Provider Identifier State_33",
263-
"Other Provider Identifier Issuer_33",
264-
"Other Provider Identifier_34",
265-
"Other Provider Identifier Type Code_34",
266-
"Other Provider Identifier State_34",
267-
"Other Provider Identifier Issuer_34",
268-
"Other Provider Identifier_35",
269-
"Other Provider Identifier Type Code_35",
270-
"Other Provider Identifier State_35",
271-
"Other Provider Identifier Issuer_35",
272-
"Other Provider Identifier_36",
273-
"Other Provider Identifier Type Code_36",
274-
"Other Provider Identifier State_36",
275-
"Other Provider Identifier Issuer_36",
276-
"Other Provider Identifier_37",
277-
"Other Provider Identifier Type Code_37",
278-
"Other Provider Identifier State_37",
279-
"Other Provider Identifier Issuer_37",
280-
"Other Provider Identifier_38",
281-
"Other Provider Identifier Type Code_38",
282-
"Other Provider Identifier State_38",
283-
"Other Provider Identifier Issuer_38",
284-
"Other Provider Identifier_39",
285-
"Other Provider Identifier Type Code_39",
286-
"Other Provider Identifier State_39",
287-
"Other Provider Identifier Issuer_39",
288-
"Other Provider Identifier_40",
289-
"Other Provider Identifier Type Code_40",
290-
"Other Provider Identifier State_40",
291-
"Other Provider Identifier Issuer_40",
292-
"Other Provider Identifier_41",
293-
"Other Provider Identifier Type Code_41",
294-
"Other Provider Identifier State_41",
295-
"Other Provider Identifier Issuer_41",
296-
"Other Provider Identifier_42",
297-
"Other Provider Identifier Type Code_42",
298-
"Other Provider Identifier State_42",
299-
"Other Provider Identifier Issuer_42",
300-
"Other Provider Identifier_43",
301-
"Other Provider Identifier Type Code_43",
302-
"Other Provider Identifier State_43",
303-
"Other Provider Identifier Issuer_43",
304-
"Other Provider Identifier_44",
305-
"Other Provider Identifier Type Code_44",
306-
"Other Provider Identifier State_44",
307-
"Other Provider Identifier Issuer_44",
308-
"Other Provider Identifier_45",
309-
"Other Provider Identifier Type Code_45",
310-
"Other Provider Identifier State_45",
311-
"Other Provider Identifier Issuer_45",
312-
"Other Provider Identifier_46",
313-
"Other Provider Identifier Type Code_46",
314-
"Other Provider Identifier State_46",
315-
"Other Provider Identifier Issuer_46",
316-
"Other Provider Identifier_47",
317-
"Other Provider Identifier Type Code_47",
318-
"Other Provider Identifier State_47",
319-
"Other Provider Identifier Issuer_47",
320-
"Other Provider Identifier_48",
321-
"Other Provider Identifier Type Code_48",
322-
"Other Provider Identifier State_48",
323-
"Other Provider Identifier Issuer_48",
324-
"Other Provider Identifier_49",
325-
"Other Provider Identifier Type Code_49",
326-
"Other Provider Identifier State_49",
327-
"Other Provider Identifier Issuer_49",
328-
"Other Provider Identifier_50",
329-
"Other Provider Identifier Type Code_50",
330-
"Other Provider Identifier State_50",
331-
"Other Provider Identifier Issuer_50",
332-
"Is Sole Proprietor",
333-
"Is Organization Subpart",
334-
"Parent Organization TIN",
335-
"Authorized Official Name Prefix Text",
336-
"Authorized Official Name Suffix Text",
337-
"Healthcare Provider Taxonomy Group_1",
338-
"Healthcare Provider Taxonomy Group_2",
339-
"Healthcare Provider Taxonomy Group_3",
340-
"Healthcare Provider Taxonomy Group_4",
341-
"Healthcare Provider Taxonomy Group_5",
342-
"Healthcare Provider Taxonomy Group_6",
343-
"Healthcare Provider Taxonomy Group_7",
344-
"Healthcare Provider Taxonomy Group_8",
345-
"Healthcare Provider Taxonomy Group_9",
346-
"Healthcare Provider Taxonomy Group_10",
347-
"Healthcare Provider Taxonomy Group_11",
348-
"Healthcare Provider Taxonomy Group_12",
349-
"Healthcare Provider Taxonomy Group_13",
350-
"Healthcare Provider Taxonomy Group_14",
351-
"Healthcare Provider Taxonomy Group_15"
352-
]
158+
import fastparquet
159+
160+
return (fastparquet,)
161+
162+
163+
@app.cell
164+
def _(fastparquet):
165+
pf2 = fastparquet.ParquetFile("/Users/paddy/JULY_FULL_enum2.parq")
166+
pf2.schema.schema_elements_by_name["Provider License Number State Code_7"]
167+
return
168+
169+
170+
@app.cell
171+
def _(fastparquet):
172+
pf3 = fastparquet.ParquetFile("/Users/paddy/JULY_FULL_enum3.parq")
173+
pf3.schema.schema_elements_by_name["Provider License Number State Code_7"]
174+
return
175+
176+
177+
@app.cell
178+
def _(convert_to_enum, pl, scan_vc):
179+
def long_running_function():
180+
scan_vc("~/JULY_FULL.parq", "~/JULY_FULL_vc.parq")
181+
_vc_df = pl.read_parquet("~/JULY_FULL_vc.parq")
182+
convert_to_enum("~/JULY_FULL.parq", "~/JULY_FULL_enum.parq", _vc_df)
353183
return
354184

355185

0 commit comments

Comments
 (0)