-
Notifications
You must be signed in to change notification settings - Fork 17
Expand file tree
/
Copy pathmod.rs
More file actions
2748 lines (2373 loc) · 103 KB
/
mod.rs
File metadata and controls
2748 lines (2373 loc) · 103 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//! Query execution module for ggsql
//!
//! Provides shared execution logic for building data maps from queries,
//! handling both global SQL and layer-specific data sources.
//!
//! This module is organized into submodules:
//! - `cte`: CTE extraction, transformation, and materialization
//! - `schema`: Schema extraction, type inference, and min/max ranges
//! - `casting`: Type requirements determination and casting logic
//! - `layer`: Layer query building, data transforms, and stat application
//! - `scale`: Scale creation, resolution, type coercion, and OOB handling
mod casting;
mod cte;
mod layer;
mod position;
mod scale;
mod schema;
// Re-export public API
pub use casting::TypeRequirement;
pub use cte::CteDefinition;
pub use schema::TypeInfo;
use crate::naming;
use crate::parser;
use crate::plot::aesthetic::{is_positional_aesthetic, AestheticContext};
use crate::plot::facet::{resolve_properties as resolve_facet_properties, FacetDataContext};
use crate::plot::layer::is_transposed;
use crate::plot::{AestheticValue, Layer, Scale, ScaleTypeKind, Schema};
use crate::{DataFrame, DataSource, GgsqlError, Plot, Result};
use std::collections::{HashMap, HashSet};
use crate::reader::Reader;
#[cfg(all(feature = "duckdb", test))]
use crate::reader::DuckDBReader;
// =============================================================================
// Validation
// =============================================================================
/// Validate every layer against the schema of its data.
///
/// Layers and schemas are paired by position. For each layer the checks below
/// run in order, and the first failure is returned as a
/// `GgsqlError::ValidationError` whose message starts with the 1-based layer
/// number:
/// - the layer's own mapping validation (`validate_mapping`)
/// - the layer's own SETTING validation (`validate_settings`)
/// - every mapped column of a geom-supported aesthetic exists in the schema
///   (synthetic columns are exempt)
/// - every PARTITION BY column exists in the schema
/// - every REMAPPING target is an aesthetic known to the geom
/// - every REMAPPING source column is one of the geom's valid stat columns
fn validate(
    layers: &[Layer],
    layer_schemas: &[Schema],
    aesthetic_context: &Option<AestheticContext>,
) -> Result<()> {
    for (idx, (layer, schema)) in layers.iter().zip(layer_schemas.iter()).enumerate() {
        // Layers are reported to the user starting at 1.
        let layer_no = idx + 1;
        let known_columns: HashSet<&str> = schema.iter().map(|c| c.name.as_str()).collect();
        let supported = layer.geom.aesthetics().supported();

        // Geom-level checks implemented on the layer itself.
        layer
            .validate_mapping(aesthetic_context, false)
            .map_err(|e| GgsqlError::ValidationError(format!("Layer {}: {}", layer_no, e)))?;
        layer
            .validate_settings()
            .map_err(|e| GgsqlError::ValidationError(format!("Layer {}: {}", layer_no, e)))?;

        // Mapped columns must exist, but only for aesthetics the geom supports.
        for (aesthetic, value) in &layer.mappings.aesthetics {
            if !supported.contains(&aesthetic.as_str()) {
                continue;
            }
            let Some(col_name) = value.column_name() else {
                continue;
            };
            // Synthetic columns (stat-generated or constants) never appear in the schema.
            if naming::is_synthetic_column(col_name) || known_columns.contains(col_name) {
                continue;
            }
            return Err(GgsqlError::ValidationError(format!(
                "Layer {}: aesthetic '{}' references non-existent column '{}'",
                layer_no, aesthetic, col_name
            )));
        }

        // PARTITION BY columns must exist in the schema.
        let unknown_partition = layer
            .partition_by
            .iter()
            .find(|col| !known_columns.contains(col.as_str()));
        if let Some(col) = unknown_partition {
            return Err(GgsqlError::ValidationError(format!(
                "Layer {}: PARTITION BY references non-existent column '{}'",
                layer_no, col
            )));
        }

        // REMAPPING may target any aesthetic the geom knows about
        // (including Delayed ones from stat transforms).
        let aesthetics_info = layer.geom.aesthetics();
        let unsupported_target = layer
            .remappings
            .aesthetics
            .keys()
            .find(|target| !aesthetics_info.contains(*target));
        if let Some(target_aesthetic) = unsupported_target {
            return Err(GgsqlError::ValidationError(format!(
                "Layer {}: REMAPPING targets unsupported aesthetic '{}' for geom '{}'",
                layer_no, target_aesthetic, layer.geom
            )));
        }

        // REMAPPING sources must be stat columns produced by this geom.
        let valid_stat_columns = layer.geom.valid_stat_columns();
        for stat_value in layer.remappings.aesthetics.values() {
            let Some(stat_col) = stat_value.column_name() else {
                continue;
            };
            if valid_stat_columns.contains(&stat_col) {
                continue;
            }
            // An empty list means the geom has no stat transform at all,
            // which deserves a more direct message.
            let message = if valid_stat_columns.is_empty() {
                format!(
                    "Layer {}: REMAPPING not supported for geom '{}' (no stat transform)",
                    layer_no, layer.geom
                )
            } else {
                format!(
                    "Layer {}: REMAPPING references unknown stat column '{}'. Valid stat columns for geom '{}' are: {}",
                    layer_no,
                    stat_col,
                    layer.geom,
                    crate::and_list(valid_stat_columns)
                )
            };
            return Err(GgsqlError::ValidationError(message));
        }
    }

    Ok(())
}
// =============================================================================
// Global Mapping & Color Splitting
// =============================================================================
/// Check if an aesthetic value is a null sentinel (explicit removal marker)
fn is_null_sentinel(value: &AestheticValue) -> bool {
matches!(
value,
AestheticValue::Literal(crate::plot::ParameterValue::Null)
)
}
/// Merge global mappings into each layer's aesthetics and resolve wildcards.
///
/// For every non-annotation layer of every spec (layers are paired by
/// position with `layer_schemas`):
/// 1. Explicit global aesthetics are copied into the layer unless the layer
///    already maps that aesthetic (layer mappings take precedence). Only
///    aesthetics the geom supports are copied, plus the `color`/`colour`
///    aliases and facet aesthetics.
/// 2. If the layer or the global mapping carries a wildcard, each supported
///    aesthetic that is still unmapped and whose user-facing name matches a
///    column in the layer's schema is mapped to that column.
/// 3. The layer's wildcard flag is cleared.
/// 4. Null-sentinel mappings (explicit "don't inherit" markers) are removed.
///
/// This function does not itself turn `color` into `fill`/`stroke`; it only
/// lets the alias through so that `split_color_aesthetic` can split it later.
fn merge_global_mappings_into_layers(specs: &mut [Plot], layer_schemas: &[Schema]) {
    for spec in specs {
        let aesthetic_ctx = spec.get_aesthetic_context();

        for (layer, schema) in spec.layers.iter_mut().zip(layer_schemas.iter()) {
            // Annotation layers do not inherit global mappings.
            if matches!(layer.source, Some(DataSource::Annotation)) {
                continue;
            }

            let supported = layer.geom.aesthetics().supported();

            // 1. Explicit global aesthetics (layer overrides global).
            //    "color"/"colour" are accepted even though they are not in
            //    `supported`, because split_color_aesthetic converts them to
            //    fill/stroke later. Facet aesthetics are accepted because they
            //    apply to all layers regardless of geom support.
            for (aesthetic, value) in &spec.global_mappings.aesthetics {
                let is_color_alias = matches!(aesthetic.as_str(), "color" | "colour");
                let is_facet_aesthetic = crate::plot::scale::is_facet_aesthetic(aesthetic.as_str());
                if supported.contains(&aesthetic.as_str()) || is_color_alias || is_facet_aesthetic {
                    // Clone the global value only when the layer has no mapping yet.
                    layer
                        .mappings
                        .aesthetics
                        .entry(aesthetic.clone())
                        .or_insert_with(|| value.clone());
                }
            }

            // 2. Wildcard expansion, restricted to columns present in the schema.
            let has_wildcard = layer.mappings.wildcard || spec.global_mappings.wildcard;
            if has_wildcard {
                let schema_columns: HashSet<&str> =
                    schema.iter().map(|c| c.name.as_str()).collect();
                for aes in &supported {
                    // Schema columns carry user-facing names, so translate the
                    // internal aesthetic name before matching.
                    let user_name = aesthetic_ctx.map_internal_to_user(aes);
                    if schema_columns.contains(user_name.as_str()) {
                        layer
                            .mappings
                            .aesthetics
                            .entry(crate::parser::builder::normalise_aes_name(aes))
                            .or_insert_with(|| AestheticValue::standard_column(&user_name));
                    }
                }
            }

            // 3. The wildcard has been resolved.
            layer.mappings.wildcard = false;

            // 4. Drop explicit "don't inherit" markers. This runs last so a
            //    sentinel blocks both the global merge and the wildcard.
            layer
                .mappings
                .aesthetics
                .retain(|_, value| !is_null_sentinel(value));
        }
    }
}
/// Expand the `color` shorthand into `fill` and `stroke`.
///
/// Three places are handled, and `color` is removed from each afterwards:
/// - the first `color` scale is copied to a `fill` and a `stroke` scale,
///   unless a scale for that aesthetic already exists;
/// - each layer's `color` mapping becomes the default for whichever of
///   `stroke`/`fill` the geom supports and the layer has not mapped;
/// - each layer's `color` SETTING parameter is treated the same way.
///
/// Removing `color` after the split avoids non-deterministic behavior from
/// HashMap iteration order.
fn split_color_aesthetic(spec: &mut Plot) {
    // 1. Scales: take the color scale out, then append the derived scales.
    if let Some(color_idx) = spec.scales.iter().position(|s| s.aesthetic == "color") {
        let color_scale = spec.scales.remove(color_idx);
        for target in ["fill", "stroke"] {
            let already_present = spec.scales.iter().any(|s| s.aesthetic == target);
            if !already_present {
                let mut derived = color_scale.clone();
                derived.aesthetic = target.to_string();
                spec.scales.push(derived);
            }
        }
    }

    for layer in &mut spec.layers {
        // 2. Mappings: color is only a default, so existing entries win.
        if let Some(color_value) = layer.mappings.aesthetics.get("color").cloned() {
            let aesthetics = layer.geom.aesthetics();
            for &aes in &["stroke", "fill"] {
                if !aesthetics.is_supported(aes) {
                    continue;
                }
                layer
                    .mappings
                    .aesthetics
                    .entry(aes.to_string())
                    .or_insert(color_value.clone());
            }
            layer.mappings.aesthetics.remove("color");
        }

        // 3. SETTING parameters: same rule as for mappings.
        if let Some(color_value) = layer.parameters.get("color").cloned() {
            let aesthetics = layer.geom.aesthetics();
            for &aes in &["stroke", "fill"] {
                if !aesthetics.is_supported(aes) {
                    continue;
                }
                layer
                    .parameters
                    .entry(aes.to_string())
                    .or_insert(color_value.clone());
            }
            layer.parameters.remove("color");
        }
    }
}
// =============================================================================
// Facet Mapping Injection
// =============================================================================
/// Inject facet variable mappings into each layer's aesthetic mappings.
///
/// Doing so lets facet aesthetics flow through the same code paths as regular
/// aesthetics (scale resolution, type casting, SELECT list building,
/// partition_by handling, etc.).
///
/// Layers are paired by position with `layer_type_info`; layers without a
/// type-info entry are left untouched. A mapping is NOT injected when:
/// - the layer already maps that facet aesthetic (from MAPPING or global);
/// - the facet variable is not a column of the layer's schema (different
///   data source);
/// - the facet has no variables (inferred from layer mappings rather than a
///   FACET clause), in which case there is nothing to iterate over.
fn add_facet_mappings_to_layers(
    layers: &mut [Layer],
    facet: &crate::plot::Facet,
    layer_type_info: &[Vec<schema::TypeInfo>],
) {
    for (layer, type_info) in layers.iter_mut().zip(layer_type_info.iter()) {
        // Internal aesthetic names (facet1, facet2) are used because the
        // user-facing names have already been transformed.
        for (var, aesthetic) in facet.layout.get_internal_aesthetic_mappings() {
            let already_mapped = layer.mappings.aesthetics.contains_key(&aesthetic);
            if already_mapped || !type_info.iter().any(|(col, _, _)| col == var) {
                continue;
            }

            // variable → facet aesthetic (internal name)
            let facet_column = AestheticValue::Column {
                name: var.to_string(),
                original_name: Some(var.to_string()),
                is_dummy: false,
            };
            layer.mappings.aesthetics.insert(aesthetic, facet_column);
        }
    }
}
// =============================================================================
// Facet Missing Column Detection and Handling
// =============================================================================
/// Flag the layers whose schema lacks at least one facet variable.
///
/// Returns one boolean per layer, in layer order. A layer is flagged `true`
/// when ANY facet variable is absent from its `layer_type_info` entry. Layers
/// without a type-info entry are reported as `false`, and when the facet has
/// no variables (inferred from layer mappings) every layer is `false`.
///
/// The result tells which layers need data duplication when
/// `missing => 'repeat'` is set on the facet.
fn identify_layers_missing_facet_column(
    layers: &[Layer],
    facet: &crate::plot::Facet,
    layer_type_info: &[Vec<schema::TypeInfo>],
) -> Vec<bool> {
    let facet_variables = facet.get_variables();

    // Nothing declared in a FACET clause, so no layer can be "missing" it.
    if facet_variables.is_empty() {
        return vec![false; layers.len()];
    }

    (0..layers.len())
        .map(|layer_idx| {
            let Some(type_info) = layer_type_info.get(layer_idx) else {
                return false;
            };
            let present: std::collections::HashSet<&str> =
                type_info.iter().map(|(name, _, _)| name.as_str()).collect();

            // Missing as soon as one facet variable is not in the schema.
            !facet_variables
                .iter()
                .all(|var| present.contains(var.as_str()))
        })
        .collect()
}
/// Collect the distinct values of a facet aesthetic across the layers that have it.
///
/// Layers flagged in `layers_missing_facet` are skipped, as are layers with no
/// data key, no DataFrame in `data_map`, or no column for the aesthetic. The
/// remaining columns are concatenated in layer order and de-duplicated; the
/// result is meant for cross-joining with layers that lack the column.
///
/// Returns `None` when no layer contributes a column or when computing the
/// unique values fails.
fn get_unique_facet_values(
    data_map: &HashMap<String, DataFrame>,
    facet_aesthetic: &str,
    layers: &[Layer],
    layers_missing_facet: &[bool],
) -> Option<polars::prelude::Series> {
    use polars::prelude::*;

    let aes_col = naming::aesthetic_column(facet_aesthetic);

    // One series per layer that actually carries the facet column.
    let mut facet_series = layers
        .iter()
        .enumerate()
        .filter(|(idx, _)| !layers_missing_facet.get(*idx).copied().unwrap_or(false))
        .filter_map(|(_, layer)| {
            let data_key = layer.data_key.as_ref()?;
            let df = data_map.get(data_key)?;
            let col = df.column(&aes_col).ok()?;
            Some(col.as_materialized_series().clone())
        });

    // The first series seeds the accumulator; no series at all means `None`.
    let mut combined: Series = facet_series.next()?;
    for series in facet_series {
        // A failed extend is deliberately ignored: that layer's values are
        // simply not added to the accumulator.
        let _ = combined.extend(&series);
    }

    combined.unique().ok()
}
/// Cross-join a DataFrame with facet values (duplicate it for each facet panel).
///
/// Every row of `df` is repeated once per entry of `unique_values`, and a new
/// column named after `facet_aesthetic`'s aesthetic column is appended holding
/// the matching facet value. For `df` rows `[a, b, c]` and values `[v1, v2]`
/// the result is rows `[a, a, b, b, c, c]` with facet column
/// `[v1, v2, v1, v2, v1, v2]`.
///
/// When `unique_values` is empty, an unchanged clone of `df` is returned.
///
/// # Errors
/// Returns `GgsqlError::InternalError` if a column cannot be gathered or the
/// expanded DataFrame cannot be assembled.
fn cross_join_with_facet_values(
    df: &DataFrame,
    unique_values: &polars::prelude::Series,
    facet_aesthetic: &str,
) -> Result<DataFrame> {
    use polars::prelude::*;

    let aes_col = naming::aesthetic_column(facet_aesthetic);
    let n_values = unique_values.len();
    if n_values == 0 {
        return Ok(df.clone());
    }
    let n_rows = df.height();

    // The expansion is done by hand with gather indices
    // (polars cross_join requires an import we may not have).

    // Row gather indices: each row index repeated n_values times,
    // e.g. [0, 0, 1, 1, 2, 2] for n_rows=3, n_values=2. They are the same for
    // every column, so build them once rather than per column.
    let row_indices: Vec<u32> = (0..n_rows)
        .flat_map(|i| std::iter::repeat_n(i as u32, n_values))
        .collect();
    let row_idx = IdxCa::new(PlSmallStr::EMPTY, &row_indices);

    // 1. Repeat each original column: [a, b, c] -> [a, a, b, b, c, c]
    let mut new_columns: Vec<Column> = Vec::with_capacity(df.get_columns().len() + 1);
    for col in df.get_columns() {
        let repeated = col.as_materialized_series().take(&row_idx).map_err(|e| {
            crate::GgsqlError::InternalError(format!("Failed to repeat column: {}", e))
        })?;
        new_columns.push(repeated.into());
    }

    // 2. Create the facet column by tiling unique_values once per row:
    //    [v1, v2, v1, v2, v1, v2] for n_rows=3, n_values=2
    let facet_indices: Vec<u32> = (0..n_rows)
        .flat_map(|_| (0..n_values).map(|j| j as u32))
        .collect();
    let facet_idx = IdxCa::new(PlSmallStr::EMPTY, &facet_indices);
    let facet_col = unique_values
        .take(&facet_idx)
        .map_err(|e| {
            crate::GgsqlError::InternalError(format!("Failed to create facet column: {}", e))
        })?
        .with_name(aes_col.into());
    new_columns.push(facet_col.into());

    DataFrame::new(new_columns).map_err(|e| {
        crate::GgsqlError::InternalError(format!("Failed to create expanded DataFrame: {}", e))
    })
}
/// Apply the facet's `missing` setting to layers that lack the facet column.
///
/// - `repeat` (the default, also used when the property is absent or not a
///   string): the layer's DataFrame is cross-joined with every unique facet
///   value found in the other layers, so the layer shows up in all panels.
/// - `null`: nothing is done here (nulls are added during unification, so the
///   layer appears only in the null panel if null is in the scale's input range).
///
/// Does nothing when the spec has no facet.
fn handle_missing_facet_columns(
    spec: &Plot,
    data_map: &mut HashMap<String, DataFrame>,
    layers_missing_facet: &[bool],
) -> Result<()> {
    use crate::plot::ParameterValue;

    let Some(facet) = spec.facet.as_ref() else {
        return Ok(());
    };

    // Anything other than a string property falls back to "repeat".
    let missing_setting = match facet.properties.get("missing") {
        Some(ParameterValue::String(s)) => s.as_str(),
        _ => "repeat",
    };
    if missing_setting == "null" {
        return Ok(());
    }

    // Internal facet aesthetics from the layout (facet1, facet2).
    let facet_aesthetics = facet.layout.internal_facet_names();
    for facet_aesthetic in &facet_aesthetics {
        // Values come only from layers that HAVE the column; if none do,
        // there is nothing to repeat across.
        let Some(unique_values) = get_unique_facet_values(
            data_map,
            facet_aesthetic,
            &spec.layers,
            layers_missing_facet,
        ) else {
            continue;
        };
        let aes_col = naming::aesthetic_column(facet_aesthetic);

        for (idx, layer) in spec.layers.iter().enumerate() {
            let is_missing = layers_missing_facet.get(idx).copied().unwrap_or(false);
            if !is_missing {
                continue;
            }
            let Some(data_key) = layer.data_key.as_ref() else {
                continue;
            };

            // Expand only DataFrames that do not already carry the column,
            // so a DataFrame is never expanded twice for the same aesthetic.
            let expanded_df = match data_map.get(data_key) {
                Some(df) if df.column(&aes_col).is_err() => {
                    cross_join_with_facet_values(df, &unique_values, facet_aesthetic)?
                }
                _ => continue,
            };
            data_map.insert(data_key.clone(), expanded_df);
        }
    }

    Ok(())
}
// =============================================================================
// Facet Resolution from Layer Mappings
// =============================================================================
/// Resolve facet configuration from layer mappings and the FACET clause.
///
/// Logic:
/// 1. Scan all layer mappings (after global merge) for the internal facet
///    aesthetics `facet1` and `facet2`.
///    After transformation: panel → facet1, row → facet1, column → facet2.
/// 2. Reject `facet2` without `facet1` (a 'column' without a 'row').
/// 3. If a FACET clause exists:
///    - reject a Wrap clause combined with layer mappings that use `facet2`
///      (Grid aesthetics);
///    - otherwise return the clause's facet unchanged. A Grid clause combined
///      with layers that only map `facet1` is accepted.
/// 4. If no FACET clause exists, infer the layout from the layer mappings:
///    `facet1` and `facet2` → Grid, `facet1` only → Wrap. The inferred layout
///    carries empty variable lists because each layer has its own mapping.
///
/// Returns:
/// - `Ok(Some(Facet))` - Resolved facet configuration
/// - `Ok(None)` - No faceting needed (no facet mappings and no FACET clause)
/// - `Err(...)` - Validation error
fn resolve_facet(
    layers: &[crate::plot::Layer],
    existing_facet: Option<crate::plot::Facet>,
) -> Result<Option<crate::plot::Facet>> {
    use crate::plot::facet::FacetLayout;
    use crate::plot::scale::is_facet_aesthetic;

    // Which internal facet aesthetics are mapped by at least one layer?
    let mut has_facet1 = false;
    let mut has_facet2 = false;
    for layer in layers {
        for aesthetic in layer.mappings.aesthetics.keys() {
            if is_facet_aesthetic(aesthetic) {
                match aesthetic.as_str() {
                    "facet1" => has_facet1 = true,
                    "facet2" => has_facet2 = true,
                    _ => {}
                }
            }
        }
    }

    // Grid requires both facet1 and facet2 (row and column);
    // facet2 alone is a column without a row.
    if has_facet2 && !has_facet1 {
        return Err(GgsqlError::ValidationError(
            "Grid facet layout requires both 'row' and 'column' aesthetics. Missing: 'row'"
                .to_string(),
        ));
    }

    // An explicit FACET clause wins as long as it is compatible with the
    // layer mappings. The facet is owned, so it is moved out rather than cloned.
    if let Some(facet) = existing_facet {
        // Wrap layout (FACET var) but a layer maps facet2, i.e. it was
        // declared with Grid aesthetics (row/column).
        if facet.is_wrap() && has_facet2 {
            return Err(GgsqlError::ValidationError(
                "FACET clause uses Wrap layout, but layer mappings use 'row'/'column' (Grid layout). \
                 Remove FACET clause to infer Grid layout, or use 'panel' aesthetic instead.".to_string()
            ));
        }
        // A Grid clause with layers that only map facet1 is deliberately
        // accepted: such a layer simply uses the row mapping only.
        return Ok(Some(facet));
    }

    // No FACET clause: infer the layout from the layer mappings.
    let inferred_layout = if has_facet1 && has_facet2 {
        Some(FacetLayout::Grid {
            row: vec![],    // Empty - each layer has its own mapping
            column: vec![], // Empty - each layer has its own mapping
        })
    } else if has_facet1 {
        Some(FacetLayout::Wrap {
            variables: vec![], // Empty - each layer has its own mapping
        })
    } else {
        None
    };

    Ok(inferred_layout.map(|layout| crate::plot::Facet::new(layout)))
}
// =============================================================================
// Discrete Column Handling
// =============================================================================
/// Add discrete mapped columns to partition_by for all layers
///
/// For each layer, examines all aesthetic mappings and adds any that map to
/// discrete columns to the layer's partition_by. This ensures proper grouping
/// for all layers, not just stat geoms.
///
/// Discreteness is determined by:
/// 1. If the aesthetic has an explicit scale with a scale_type:
/// - ScaleTypeKind::Discrete or Binned → discrete (add to partition_by)
/// - ScaleTypeKind::Continuous → not discrete (skip)
/// - ScaleTypeKind::Identity → fall back to schema
/// 2. Otherwise, use schema's is_discrete flag (based on column data type)
///
/// Columns already in partition_by (from explicit PARTITION BY clause) are skipped.
/// Stat-consumed aesthetics (x for bar, x for histogram) are also skipped.
fn add_discrete_columns_to_partition_by(
layers: &mut [Layer],
layer_schemas: &[Schema],
scales: &[Scale],
aesthetic_ctx: &AestheticContext,
) {
// Build a map of aesthetic -> scale for quick lookup
let scale_map: HashMap<&str, &Scale> =
scales.iter().map(|s| (s.aesthetic.as_str(), s)).collect();
for (layer, schema) in layers.iter_mut().zip(layer_schemas.iter()) {
let schema_columns: HashSet<&str> = schema.iter().map(|c| c.name.as_str()).collect();
let discrete_columns: HashSet<&str> = schema
.iter()
.filter(|c| c.is_discrete)
.map(|c| c.name.as_str())
.collect();
// Build set of excluded aesthetics that should not trigger auto-grouping:
// - Stat-consumed aesthetics (transformed, not grouped)
// - 'label' aesthetic (text content to display, not grouping categories)
let consumed_aesthetics = layer.geom.stat_consumed_aesthetics();
let mut excluded_aesthetics: HashSet<&str> = consumed_aesthetics.iter().copied().collect();
excluded_aesthetics.insert("label");
for (aesthetic, value) in &layer.mappings.aesthetics {
// Skip positional aesthetics - these should not trigger auto-grouping.
// Stats that need to group by positional aesthetics (like bar/histogram)
// already handle this themselves via stat_consumed_aesthetics().
if is_positional_aesthetic(aesthetic) {
continue;
}
// Skip excluded aesthetics (stat-consumed or label)
if excluded_aesthetics.contains(aesthetic.as_str()) {
continue;
}
if let Some(col) = value.column_name() {
// Skip if column doesn't exist in schema
if !schema_columns.contains(col) {
continue;
}
// Determine if this aesthetic is discrete:
// 1. Check if there's an explicit scale with a scale_type
// 2. Fall back to schema's is_discrete
//
// Discrete and Binned scales produce categorical groupings.
// Continuous scales don't group. Identity defers to column type.
let primary_aes = aesthetic_ctx
.primary_internal_positional(aesthetic)
.unwrap_or(aesthetic);
let is_discrete = if let Some(scale) = scale_map.get(primary_aes) {
if let Some(ref scale_type) = scale.scale_type {
match scale_type.scale_type_kind() {
ScaleTypeKind::Discrete
| ScaleTypeKind::Binned
| ScaleTypeKind::Ordinal => true,
ScaleTypeKind::Continuous => false,
ScaleTypeKind::Identity => discrete_columns.contains(col),
}
} else {
// Scale exists but no explicit type - use schema
discrete_columns.contains(col)
}
} else {
// No scale for this aesthetic - use schema
discrete_columns.contains(col)
};
// Skip if not discrete
if !is_discrete {
continue;
}
// Use the prefixed aesthetic column name, since the query renames
// columns to prefixed names (e.g., island → __ggsql_aes_fill__)
let aes_col_name = naming::aesthetic_column(aesthetic);
// Skip if already in partition_by
if layer.partition_by.contains(&aes_col_name) {
continue;
}
layer.partition_by.push(aes_col_name);
}
}
}
}
// =============================================================================
// Column Pruning
// =============================================================================
/// Collect the set of column names required for a specific layer.
///
/// Returns column names needed for:
/// - Aesthetic mappings (e.g., `__ggsql_aes_x__`, `__ggsql_aes_y__`)
/// - Bin end columns for binned scales (e.g., `__ggsql_aes_x2__`)
/// - Facet variables (shared across all layers)
/// - Partition columns (for Vega-Lite detail encoding)
/// - Order column for Path geoms
fn collect_layer_required_columns(layer: &Layer, spec: &Plot) -> HashSet<String> {
use crate::plot::layer::geom::GeomType;
let mut required = HashSet::new();
// Facet aesthetic columns (shared across all layers)
// Only the aesthetic-prefixed columns are needed for Vega-Lite output.
// The original variable names (e.g., "species") are not needed after
// the aesthetic columns (e.g., "__ggsql_aes_facet1__") have been created.
if let Some(ref facet) = spec.facet {
for aesthetic in facet.layout.internal_facet_names() {
required.insert(naming::aesthetic_column(&aesthetic));
}
}
// Aesthetic columns for this layer
for aesthetic in layer.mappings.aesthetics.keys() {
let aes_col = naming::aesthetic_column(aesthetic);
required.insert(aes_col.clone());
// Check if this aesthetic has a binned scale
if let Some(scale) = spec.find_scale(aesthetic) {
if let Some(ref scale_type) = scale.scale_type {
if scale_type.scale_type_kind() == ScaleTypeKind::Binned {
required.insert(naming::bin_end_column(&aes_col));
}
}
}
}
// Partition columns for this layer (used by Vega-Lite detail encoding)
for col in &layer.partition_by {
required.insert(col.clone());
}
// Order column for Path geoms
if layer.geom.geom_type() == GeomType::Path {
required.insert(naming::ORDER_COLUMN.to_string());
}
// Position offset column for position adjustments that create pos1offset
// This column is created by dodge/jitter positions and is not in layer.mappings
if layer.position.creates_pos1offset() {
required.insert(naming::aesthetic_column("pos1offset"));
}
// Position offset column for position adjustments that create pos2offset
// This column is created by jitter position for vertical jittering
if layer.position.creates_pos2offset() {
required.insert(naming::aesthetic_column("pos2offset"));
}
required
}
/// Return a copy of `df` restricted to the columns named in `required`.
///
/// Column order follows `df`. Names in `required` that `df` does not contain
/// are silently ignored.
///
/// # Errors
/// Returns `GgsqlError::InternalError` when none of the required columns exist
/// in `df`, or when the column selection itself fails.
fn prune_dataframe(df: &DataFrame, required: &HashSet<String>) -> Result<DataFrame> {
    let mut columns_to_keep: Vec<String> = Vec::new();
    for name in df.get_column_names() {
        if required.contains(name.as_str()) {
            columns_to_keep.push(name.to_string());
        }
    }

    if columns_to_keep.is_empty() {
        return Err(GgsqlError::InternalError(format!(
            "No columns remain after pruning. Required columns: {:?}",
            required
        )));
    }

    match df.select(&columns_to_keep) {
        Ok(pruned) => Ok(pruned),
        Err(e) => Err(GgsqlError::InternalError(format!(
            "Failed to prune columns: {}",
            e
        ))),
    }
}
/// Shrink every layer-backed DataFrame in `data_map` to the columns its layer needs.
///
/// For each layer of each spec that carries a `data_key` present in
/// `data_map`, the required column set is computed with
/// `collect_layer_required_columns` and the stored frame is replaced by its
/// pruned version. Layers without a `data_key`, or whose key is missing from
/// the map, are left untouched.
///
/// # Errors
/// Propagates the first error returned by `prune_dataframe`.
fn prune_dataframes_per_layer(
    specs: &[Plot],
    data_map: &mut HashMap<String, DataFrame>,
) -> Result<()> {
    for spec in specs {
        for layer in &spec.layers {
            // Skip layers that have no key, or whose key has no frame in the map.
            let key = match layer.data_key.as_ref() {
                Some(key) => key,
                None => continue,
            };
            let frame = match data_map.get(key) {
                Some(frame) => frame,
                None => continue,
            };

            let needed = collect_layer_required_columns(layer, spec);
            let slimmed = prune_dataframe(frame, &needed)?;

            // NOTE(review): the pruned frame overwrites the entry under the same
            // key, so a key shared by two layers would be pruned again from the
            // already-reduced frame — confirm keys are unique per layer.
            data_map.insert(key.clone(), slimmed);
        }
    }
    Ok(())
}
// =============================================================================
// Public API: PreparedData
// =============================================================================
/// Result of preparing data for visualization.
///
/// This is the success value returned by `prepare_data_with_reader`. It bundles
/// the fetched data with the parsed specifications and the two textual parts
/// of the original query.
pub struct PreparedData {
/// Data map with global and layer-specific DataFrames, keyed by name.
/// NOTE(review): keys are presumably the layers' `data_key` values plus a
/// key for the global result — confirm against the code that fills the map.
pub data: HashMap<String, DataFrame>,
/// Parsed and resolved visualization specifications, one `Plot` per entry.
pub specs: Vec<Plot>,
/// The SQL portion of the query (the part that is not the VISUALISE clause).
pub sql: String,
/// The VISUALISE portion of the query.
pub visual: String,
}
/// Build data map from a query using a Reader
///
/// This is the main entry point for preparing visualization data from a ggsql query.
///
/// # Arguments
/// * `query` - The full ggsql query string
/// * `reader` - A Reader implementation for executing SQL
pub fn prepare_data_with_reader<R: Reader>(query: &str, reader: &R) -> Result<PreparedData> {
let execute_query = |sql: &str| reader.execute_sql(sql);
let dialect = reader.dialect();
// Parse once and create SourceTree
let source_tree = parser::SourceTree::new(query)?;
source_tree.validate()?;
// Check if query has VISUALISE statements
let root = source_tree.root();
if source_tree
.find_node(&root, "(visualise_statement) @viz")
.is_none()
{
return Err(GgsqlError::ValidationError(
"No visualization specifications found".to_string(),
));
}
// Build AST from existing tree
let mut specs = parser::build_ast(&source_tree)?;
if specs.is_empty() {
return Err(GgsqlError::ValidationError(
"No visualization specifications found".to_string(),
));
}
// Extract CTE definitions from the source tree (in declaration order)
let ctes = cte::extract_ctes(&source_tree);
// Materialize CTEs as registered tables via reader.register()
let materialized_ctes = cte::materialize_ctes(&ctes, reader)?;
// Build data map for multi-source support
let mut data_map: HashMap<String, DataFrame> = HashMap::new();
// Extract SQL once (reused later for PreparedData)
let sql_part = source_tree.extract_sql();
// Execute global SQL if present
// If there's a WITH clause, extract just the trailing SELECT and transform CTE references.
// The global result is stored as a temp table so filtered layers can query it efficiently.
// Track whether we actually create the temp table (depends on transform_global_sql succeeding)
let mut has_global_table = false;
if sql_part.is_some() {
if let Some(transformed_sql) = cte::transform_global_sql(&source_tree, &materialized_ctes) {
// Execute global result SQL and register result as a temp table
let df = execute_query(&transformed_sql)?;
reader.register(&naming::global_table(), df, true)?;
// NOTE: Don't read into data_map yet - defer until after casting is determined
// The temp table exists and can be used for schema fetching
has_global_table = true;
}
}
// Validate all layers have a data source (explicit source or global data)
for (idx, layer) in specs[0].layers.iter().enumerate() {
if layer.source.is_none() && !has_global_table {
return Err(GgsqlError::ValidationError(format!(
"Layer {} has no data source. Either provide a SQL query before VISUALISE or use FROM in the layer.",
idx + 1
)));
}
}
// Build source queries for each layer to fetch initial type info
// Every layer now has its own source query (either explicit source or global table)
// For annotation layers, this is where array recycling and parameter→mapping conversion happens
let layer_source_queries: Vec<String> = specs[0]
.layers
.iter_mut()
.map(|l| layer::layer_source_query(l, &materialized_ctes, has_global_table, dialect))
.collect::<Result<Vec<_>>>()?;
// Get types for each layer from source queries (Phase 1: types only, no min/max yet)
let mut layer_type_info: Vec<Vec<schema::TypeInfo>> = Vec::new();
for source_query in &layer_source_queries {
let type_info = schema::fetch_schema_types(source_query, &execute_query)?;
layer_type_info.push(type_info);
}
// Initial schemas (types only, no min/max - will be completed after base queries)
let mut layer_schemas: Vec<Schema> = layer_type_info
.iter()
.map(|ti| schema::type_info_to_schema(ti))
.collect();
// Merge global mappings into layer aesthetics and expand wildcards
// Smart wildcard expansion only creates mappings for columns that exist in schema
// NOTE: Both global and layer aesthetics are already in internal format (pos1, pos2)
// because transformation happens in builder.rs right after parsing