@@ -11,6 +11,18 @@ def _():
1111 return (pl ,)
1212
1313
@app.cell
def _(mo):
    # Title cell: renders the notebook heading and a one-sentence summary as markdown.
    mo.md(
        r"""
    # Dealing with entire large dataframes

    This blog walks through some techniques for dealing with a large dataframe when you need to look at the entire thing.
    """
    )
    return
24+
25+
1426@app .cell
1527def _ (mo ):
1628 mo .md (
@@ -25,6 +37,11 @@ def _(mo):
2537
2638@app .cell
2739def _ (pl ):
def get_lazy_df(fname):
    """Open ``fname`` lazily, picking the polars scanner from its extension.

    Args:
        fname: Path to the input file, as a string or a path-like object.

    Returns:
        ``pl.scan_csv(fname)`` when the name ends in "csv" (compared
        case-insensitively), otherwise ``pl.scan_parquet(fname)``.
    """
    # str() lets path-like objects through, and lower() makes "DATA.CSV" count
    # as CSV instead of silently being handed to the parquet scanner.
    if str(fname).lower().endswith("csv"):
        return pl.scan_csv(fname)
    return pl.scan_parquet(fname)
2845
2946 def get_categoricals (df , n_vals = 250 ):
3047 cat_columns = []
@@ -34,24 +51,25 @@ def get_categoricals(df, n_vals=250):
3451 return cat_columns
3552
def scan_vc(fname, out_fname, n_rows=500_000):
    """Write value counts for the categorical columns of ``fname``.

    The first ``n_rows`` rows are collected into memory and handed to
    ``get_categoricals`` to choose the columns. The value counts themselves
    are computed over a fresh lazy scan of the file and sunk to ``out_fname``.

    Args:
        fname: Input file, opened through ``get_lazy_df``.
        out_fname: Destination parquet file for the value counts.
        n_rows: Number of leading rows sampled to choose the columns.
    """
    sample = get_lazy_df(fname)[:n_rows].collect()
    categorical = get_categoricals(sample)
    print("finished get_categoricals this many columns", len(categorical))
    # Iterate the sample's columns (not `categorical`) so the expressions
    # follow the sample's column order.
    vc_exprs = [
        pl.col(name).value_counts(sort=True).implode()
        for name in sample.columns
        if name in categorical
    ]
    get_lazy_df(fname).select(vc_exprs).sink_parquet(out_fname)
4664 #scan_vc("~/JULY_FULL.parq", "~/JULY_FULL_vc2.parq")
4765 #this took 57 seconds on my MBA M1
4866 return get_categoricals , scan_vc
4967
5068
5169@app .cell
52- def _ (scan_vc ):
53- scan_vc ("~/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv" , "~/JULY_FULL_vc2.parq" )
54-
def _():
    # Disabled run of scan_vc over the NPPES CSV; left commented out.
    # scan_vc("~/NPPES_Data_Dissemination_July_2025/npidata_pfile_20050523-20250713.csv", "~/JULY_FULL_vc2.parq")
    # Timing note: 1m24s on my laptop.
    return
5674
5775
@@ -109,247 +127,59 @@ def _():
109127
110128
111129@app .cell
112- def _ (convert_to_enum , pl , scan_vc ):
113- def long_running_function ():
114- scan_vc ("~/JULY_FULL.parq" , "~/JULY_FULL_vc.parq" )
115- _vc_df = pl .read_parquet ("~/JULY_FULL_vc.parq" )
116- convert_to_enum ("~/JULY_FULL.parq" , "~/JULY_FULL_enum.parq" , _vc_df )
def _(pl):
    def convert_to_enum3(fname, out_fname, vc_df):
        """Copy the columns named in ``vc_df`` from ``fname`` to ``out_fname``.

        No cast is applied: each column listed in ``vc_df.columns`` is
        selected as-is from a lazy parquet scan and sunk to ``out_fname``.
        """
        passthrough = [pl.col(name) for name in vc_df.columns]
        pl.scan_parquet(fname).select(passthrough).sink_parquet(out_fname)

    # Timing note: 22 seconds on my machine
    # convert_to_enum3("~/JULY_FULL.parq", "~/JULY_FULL_enum3.parq", vc_df)
    return
139+
140+
@app.cell
def _(pl):
    # Peek at the top-left corner of the enum2 file: first 5 rows, first 5 columns.
    _n_rows, _n_cols = 5, 5
    _head = pl.read_parquet("~/JULY_FULL_enum2.parq", n_rows=_n_rows)
    _head[_head.columns[:_n_cols]]
    return
148+
149+
@app.cell
def _(pl):
    # Display the first 5000 rows of the enum3 file.
    _preview_rows = 5000
    pl.read_parquet("~/JULY_FULL_enum3.parq", n_rows=_preview_rows)
    return
118154
119155
120156@app .cell
121157def _ ():
122- cat_cols = [
123- "Entity Type Code" ,
124- "Replacement NPI" ,
125- "Employer Identification Number (EIN)" ,
126- "Provider Name Prefix Text" ,
127- "Provider Name Suffix Text" ,
128- "Provider Other Organization Name" ,
129- "Provider Other Organization Name Type Code" ,
130- "Provider Other Name Prefix Text" ,
131- "Provider Other Name Suffix Text" ,
132- "Provider Other Last Name Type Code" ,
133- "Provider Business Mailing Address State Name" ,
134- "Provider Business Mailing Address Country Code (If outside U.S.)" ,
135- "Provider Business Practice Location Address State Name" ,
136- "Provider Business Practice Location Address Country Code (If outside U.S.)" ,
137- "NPI Deactivation Reason Code" ,
138- "Provider Sex Code" ,
139- "Provider License Number State Code_1" ,
140- "Healthcare Provider Primary Taxonomy Switch_1" ,
141- "Provider License Number State Code_2" ,
142- "Healthcare Provider Primary Taxonomy Switch_2" ,
143- "Provider License Number State Code_3" ,
144- "Healthcare Provider Primary Taxonomy Switch_3" ,
145- "Provider License Number State Code_4" ,
146- "Healthcare Provider Primary Taxonomy Switch_4" ,
147- "Provider License Number State Code_5" ,
148- "Healthcare Provider Primary Taxonomy Switch_5" ,
149- "Provider License Number State Code_6" ,
150- "Healthcare Provider Primary Taxonomy Switch_6" ,
151- "Provider License Number State Code_7" ,
152- "Healthcare Provider Primary Taxonomy Switch_7" ,
153- "Provider License Number State Code_8" ,
154- "Healthcare Provider Primary Taxonomy Switch_8" ,
155- "Provider License Number State Code_9" ,
156- "Healthcare Provider Primary Taxonomy Switch_9" ,
157- "Provider License Number State Code_10" ,
158- "Healthcare Provider Primary Taxonomy Switch_10" ,
159- "Provider License Number State Code_11" ,
160- "Healthcare Provider Primary Taxonomy Switch_11" ,
161- "Provider License Number State Code_12" ,
162- "Healthcare Provider Primary Taxonomy Switch_12" ,
163- "Healthcare Provider Taxonomy Code_13" ,
164- "Provider License Number State Code_13" ,
165- "Healthcare Provider Primary Taxonomy Switch_13" ,
166- "Healthcare Provider Taxonomy Code_14" ,
167- "Provider License Number State Code_14" ,
168- "Healthcare Provider Primary Taxonomy Switch_14" ,
169- "Healthcare Provider Taxonomy Code_15" ,
170- "Provider License Number_15" ,
171- "Provider License Number State Code_15" ,
172- "Healthcare Provider Primary Taxonomy Switch_15" ,
173- "Other Provider Identifier Type Code_1" ,
174- "Other Provider Identifier State_1" ,
175- "Other Provider Identifier Type Code_2" ,
176- "Other Provider Identifier State_2" ,
177- "Other Provider Identifier Type Code_3" ,
178- "Other Provider Identifier State_3" ,
179- "Other Provider Identifier Type Code_4" ,
180- "Other Provider Identifier State_4" ,
181- "Other Provider Identifier Type Code_5" ,
182- "Other Provider Identifier State_5" ,
183- "Other Provider Identifier Type Code_6" ,
184- "Other Provider Identifier State_6" ,
185- "Other Provider Identifier Type Code_7" ,
186- "Other Provider Identifier State_7" ,
187- "Other Provider Identifier Type Code_8" ,
188- "Other Provider Identifier State_8" ,
189- "Other Provider Identifier Type Code_9" ,
190- "Other Provider Identifier State_9" ,
191- "Other Provider Identifier Type Code_10" ,
192- "Other Provider Identifier State_10" ,
193- "Other Provider Identifier Type Code_11" ,
194- "Other Provider Identifier State_11" ,
195- "Other Provider Identifier Type Code_12" ,
196- "Other Provider Identifier State_12" ,
197- "Other Provider Identifier Type Code_13" ,
198- "Other Provider Identifier State_13" ,
199- "Other Provider Identifier Type Code_14" ,
200- "Other Provider Identifier State_14" ,
201- "Other Provider Identifier Type Code_15" ,
202- "Other Provider Identifier State_15" ,
203- "Other Provider Identifier Type Code_16" ,
204- "Other Provider Identifier State_16" ,
205- "Other Provider Identifier Type Code_17" ,
206- "Other Provider Identifier State_17" ,
207- "Other Provider Identifier Type Code_18" ,
208- "Other Provider Identifier State_18" ,
209- "Other Provider Identifier Type Code_19" ,
210- "Other Provider Identifier State_19" ,
211- "Other Provider Identifier Type Code_20" ,
212- "Other Provider Identifier State_20" ,
213- "Other Provider Identifier Issuer_20" ,
214- "Other Provider Identifier Type Code_21" ,
215- "Other Provider Identifier State_21" ,
216- "Other Provider Identifier Issuer_21" ,
217- "Other Provider Identifier Type Code_22" ,
218- "Other Provider Identifier State_22" ,
219- "Other Provider Identifier Issuer_22" ,
220- "Other Provider Identifier_23" ,
221- "Other Provider Identifier Type Code_23" ,
222- "Other Provider Identifier State_23" ,
223- "Other Provider Identifier Issuer_23" ,
224- "Other Provider Identifier_24" ,
225- "Other Provider Identifier Type Code_24" ,
226- "Other Provider Identifier State_24" ,
227- "Other Provider Identifier Issuer_24" ,
228- "Other Provider Identifier_25" ,
229- "Other Provider Identifier Type Code_25" ,
230- "Other Provider Identifier State_25" ,
231- "Other Provider Identifier Issuer_25" ,
232- "Other Provider Identifier_26" ,
233- "Other Provider Identifier Type Code_26" ,
234- "Other Provider Identifier State_26" ,
235- "Other Provider Identifier Issuer_26" ,
236- "Other Provider Identifier_27" ,
237- "Other Provider Identifier Type Code_27" ,
238- "Other Provider Identifier State_27" ,
239- "Other Provider Identifier Issuer_27" ,
240- "Other Provider Identifier_28" ,
241- "Other Provider Identifier Type Code_28" ,
242- "Other Provider Identifier State_28" ,
243- "Other Provider Identifier Issuer_28" ,
244- "Other Provider Identifier_29" ,
245- "Other Provider Identifier Type Code_29" ,
246- "Other Provider Identifier State_29" ,
247- "Other Provider Identifier Issuer_29" ,
248- "Other Provider Identifier_30" ,
249- "Other Provider Identifier Type Code_30" ,
250- "Other Provider Identifier State_30" ,
251- "Other Provider Identifier Issuer_30" ,
252- "Other Provider Identifier_31" ,
253- "Other Provider Identifier Type Code_31" ,
254- "Other Provider Identifier State_31" ,
255- "Other Provider Identifier Issuer_31" ,
256- "Other Provider Identifier_32" ,
257- "Other Provider Identifier Type Code_32" ,
258- "Other Provider Identifier State_32" ,
259- "Other Provider Identifier Issuer_32" ,
260- "Other Provider Identifier_33" ,
261- "Other Provider Identifier Type Code_33" ,
262- "Other Provider Identifier State_33" ,
263- "Other Provider Identifier Issuer_33" ,
264- "Other Provider Identifier_34" ,
265- "Other Provider Identifier Type Code_34" ,
266- "Other Provider Identifier State_34" ,
267- "Other Provider Identifier Issuer_34" ,
268- "Other Provider Identifier_35" ,
269- "Other Provider Identifier Type Code_35" ,
270- "Other Provider Identifier State_35" ,
271- "Other Provider Identifier Issuer_35" ,
272- "Other Provider Identifier_36" ,
273- "Other Provider Identifier Type Code_36" ,
274- "Other Provider Identifier State_36" ,
275- "Other Provider Identifier Issuer_36" ,
276- "Other Provider Identifier_37" ,
277- "Other Provider Identifier Type Code_37" ,
278- "Other Provider Identifier State_37" ,
279- "Other Provider Identifier Issuer_37" ,
280- "Other Provider Identifier_38" ,
281- "Other Provider Identifier Type Code_38" ,
282- "Other Provider Identifier State_38" ,
283- "Other Provider Identifier Issuer_38" ,
284- "Other Provider Identifier_39" ,
285- "Other Provider Identifier Type Code_39" ,
286- "Other Provider Identifier State_39" ,
287- "Other Provider Identifier Issuer_39" ,
288- "Other Provider Identifier_40" ,
289- "Other Provider Identifier Type Code_40" ,
290- "Other Provider Identifier State_40" ,
291- "Other Provider Identifier Issuer_40" ,
292- "Other Provider Identifier_41" ,
293- "Other Provider Identifier Type Code_41" ,
294- "Other Provider Identifier State_41" ,
295- "Other Provider Identifier Issuer_41" ,
296- "Other Provider Identifier_42" ,
297- "Other Provider Identifier Type Code_42" ,
298- "Other Provider Identifier State_42" ,
299- "Other Provider Identifier Issuer_42" ,
300- "Other Provider Identifier_43" ,
301- "Other Provider Identifier Type Code_43" ,
302- "Other Provider Identifier State_43" ,
303- "Other Provider Identifier Issuer_43" ,
304- "Other Provider Identifier_44" ,
305- "Other Provider Identifier Type Code_44" ,
306- "Other Provider Identifier State_44" ,
307- "Other Provider Identifier Issuer_44" ,
308- "Other Provider Identifier_45" ,
309- "Other Provider Identifier Type Code_45" ,
310- "Other Provider Identifier State_45" ,
311- "Other Provider Identifier Issuer_45" ,
312- "Other Provider Identifier_46" ,
313- "Other Provider Identifier Type Code_46" ,
314- "Other Provider Identifier State_46" ,
315- "Other Provider Identifier Issuer_46" ,
316- "Other Provider Identifier_47" ,
317- "Other Provider Identifier Type Code_47" ,
318- "Other Provider Identifier State_47" ,
319- "Other Provider Identifier Issuer_47" ,
320- "Other Provider Identifier_48" ,
321- "Other Provider Identifier Type Code_48" ,
322- "Other Provider Identifier State_48" ,
323- "Other Provider Identifier Issuer_48" ,
324- "Other Provider Identifier_49" ,
325- "Other Provider Identifier Type Code_49" ,
326- "Other Provider Identifier State_49" ,
327- "Other Provider Identifier Issuer_49" ,
328- "Other Provider Identifier_50" ,
329- "Other Provider Identifier Type Code_50" ,
330- "Other Provider Identifier State_50" ,
331- "Other Provider Identifier Issuer_50" ,
332- "Is Sole Proprietor" ,
333- "Is Organization Subpart" ,
334- "Parent Organization TIN" ,
335- "Authorized Official Name Prefix Text" ,
336- "Authorized Official Name Suffix Text" ,
337- "Healthcare Provider Taxonomy Group_1" ,
338- "Healthcare Provider Taxonomy Group_2" ,
339- "Healthcare Provider Taxonomy Group_3" ,
340- "Healthcare Provider Taxonomy Group_4" ,
341- "Healthcare Provider Taxonomy Group_5" ,
342- "Healthcare Provider Taxonomy Group_6" ,
343- "Healthcare Provider Taxonomy Group_7" ,
344- "Healthcare Provider Taxonomy Group_8" ,
345- "Healthcare Provider Taxonomy Group_9" ,
346- "Healthcare Provider Taxonomy Group_10" ,
347- "Healthcare Provider Taxonomy Group_11" ,
348- "Healthcare Provider Taxonomy Group_12" ,
349- "Healthcare Provider Taxonomy Group_13" ,
350- "Healthcare Provider Taxonomy Group_14" ,
351- "Healthcare Provider Taxonomy Group_15"
352- ]
158+ import fastparquet
159+
160+ return (fastparquet ,)
161+
162+
@app.cell
def _(fastparquet):
    # Show the parquet schema element for the
    # "Provider License Number State Code_7" column of the enum2 file.
    #
    # "~" is expanded explicitly instead of hard-coding one user's home
    # directory, in line with the "~/JULY_FULL_*.parq" paths used by the other
    # cells. The underscore alias keeps the import private to this cell.
    from os.path import expanduser as _expanduser

    pf2 = fastparquet.ParquetFile(_expanduser("~/JULY_FULL_enum2.parq"))
    pf2.schema.schema_elements_by_name["Provider License Number State Code_7"]
    return
168+
169+
@app.cell
def _(fastparquet):
    # Show the parquet schema element for the
    # "Provider License Number State Code_7" column of the enum3 file.
    #
    # "~" is expanded explicitly instead of hard-coding one user's home
    # directory, in line with the "~/JULY_FULL_*.parq" paths used by the other
    # cells. The underscore alias keeps the import private to this cell.
    from os.path import expanduser as _expanduser

    pf3 = fastparquet.ParquetFile(_expanduser("~/JULY_FULL_enum3.parq"))
    pf3.schema.schema_elements_by_name["Provider License Number State Code_7"]
    return
175+
176+
@app.cell
def _(convert_to_enum, pl, scan_vc):
    def long_running_function():
        """Chain ``scan_vc`` and ``convert_to_enum`` over ``~/JULY_FULL.parq``.

        ``scan_vc`` writes the value-count file, which is read back and
        passed to ``convert_to_enum`` together with the same source file.
        """
        source = "~/JULY_FULL.parq"
        counts_path = "~/JULY_FULL_vc.parq"
        scan_vc(source, counts_path)
        counts = pl.read_parquet(counts_path)
        convert_to_enum(source, "~/JULY_FULL_enum.parq", counts)

    return
354184
355185
0 commit comments