@@ -115,10 +115,45 @@ Select from the table records after ordering the records by primary key value
115115in descending order. --ordered-desc and --ordered-asc are also available to
116116control whether sort is descending or ascending, respectively.
117117
118- =item B<--random >
118+ =item B<--random[=>I<method>B<]>
119119
120- Randomize the rows initially selected from each table. May significantly
121- increase the running time of the script.
120+ Randomize the rows initially selected from each table. If no method is specified,
121+ uses the best available method for your PostgreSQL version (BERNOULLI on 9.5+,
122+ LEGACY on older versions).
123+
124+ Available methods (abbreviations supported):
125+
126+ =over 8
127+
128+ =item * I<bernoulli> (or I<b>, I<bern>, etc.)
129+
130+ TABLESAMPLE BERNOULLI - Row-level random sampling. Each row is independently
131+ selected with the specified probability. Provides the most uniformly random
132+ distribution but slower on very large tables. Requires PostgreSQL 9.5+.
133+
134+ =item * I<system> (or I<s>, I<sys>, etc.)
135+
136+ TABLESAMPLE SYSTEM - Block-level random sampling at 8KB page granularity.
137+ Much faster than BERNOULLI (5-20x on large tables) but with potential clustering
138+ bias since entire blocks are selected together. Best for large tables (>1M rows)
139+ where performance matters more than perfect randomness. Requires PostgreSQL 9.5+.
140+
141+ =item * I<legacy> (or I<l>, I<leg>, etc.)
142+
143+ ORDER BY random() - Traditional method that works on all PostgreSQL versions.
144+ Slowest method (sorts entire table). Only use on PostgreSQL <9.5 or when
145+ testing backward compatibility.
146+
147+ =back
148+
149+ Examples:
150+
151+     --random           Use the version-appropriate default (BERNOULLI on PG 9.5+, LEGACY otherwise)
152+     --random=system    Fast block-level sampling for large tables
153+     --random=b         Abbreviation for --random=bernoulli
154+     --random=sys       Same as --random=system
155+
156+ Note: --random and --ordered are mutually exclusive.
122157
123158=item B<--sample-schema= >I<schema >
124159
@@ -419,6 +454,86 @@ sub quote_constant (@) {
419454 return wantarray ? @quoted : $quoted [0];
420455}
421456
# Return the number of rows in $table.
#
# Arguments:
#   $dbh   - database handle; only selectrow_array() is called on it.
#   $table - table object. Its schema() and table() methods are used to look
#            up planner statistics, and the object itself is interpolated
#            into the count(*) query (presumably it stringifies to a quoted,
#            schema-qualified name - confirm against the Table class).
#   %args  - use_exact => 1 skips the estimate and always runs count(*).
#
# Returns a single number: the planner's estimate (pg_class.reltuples) when a
# usable one exists, otherwise the exact count(*).
#
# The estimate is only trusted when it is strictly positive. A never-analyzed
# table reports reltuples = -1 on PostgreSQL 14+, but 0 on older releases, and
# a missing table yields no row at all. Treating 0 as a real count would make
# callers that derive a sampling percentage from it sample 100% of a table
# that merely lacks statistics. Falling back costs one count(*), which is
# cheap when the table really is empty.
sub get_row_count {
    my ($dbh, $table, %args) = @_;

    unless ($args{use_exact}) {
        my ($estimate) = $dbh->selectrow_array(qq{
            SELECT c.reltuples::bigint
              FROM pg_class c
              JOIN pg_namespace n ON n.oid = c.relnamespace
             WHERE n.nspname = ?
               AND c.relname = ?
               AND c.relkind IN ('r', 'p')
        }, undef, $table->schema, $table->table);

        return $estimate if defined $estimate && $estimate > 0;
    }

    # Exact path: requested explicitly, or no trustworthy statistics.
    my ($exact) = $dbh->selectrow_array(qq{ SELECT count(*) FROM $table });
    return $exact;
}
486+
# Map a user-supplied --random value onto a canonical sampling method name.
#
# The value is compared case-insensitively against BERNOULLI, SYSTEM and
# LEGACY; a full name or any leading fragment that identifies exactly one of
# them is accepted.
#
# Returns the upper-case method name, or undef when the input is undefined or
# the empty string (no method was given).
# Dies with a user-facing message when the input matches no method, or when
# it is a prefix of more than one.
sub resolve_random_method {
    my ($input) = @_;
    $input = '' unless defined $input;

    # Explicit undef (not a bare return) so list-context callers still
    # receive exactly one element.
    return undef if $input eq '';

    my $wanted = uc $input;
    my $width  = length $wanted;

    my @candidates;
    for my $method (qw/BERNOULLI SYSTEM LEGACY/) {
        push @candidates, $method
            if substr($method, 0, $width) eq $wanted;
    }

    if (!@candidates) {
        die "Error: Unknown random method '$input'\n"
          . "Valid methods: bernoulli, system, legacy\n"
          . "(You can use abbreviations like 'b', 's', 'l')\n";
    }

    if (@candidates > 1) {
        my $choices = join ' or ', map { lc $_ } @candidates;
        my $needed  = $width + 1;
        die "Error: Ambiguous random method '$input'\n"
          . "Could be: " . $choices . "\n"
          . "Please use at least " . $needed . " characters to disambiguate\n";
    }

    return $candidates[0];
}
514+
# Choose the sampling method for this run.
#
# Argument:
#   $pg_version - server version, compared against version->declare('9.5').
#
# Reads $opt{random_method} from the file-level options hash:
#   - undefined: returns 'BERNOULLI' when the server is 9.5 or newer,
#     'LEGACY' otherwise;
#   - defined: returned unchanged, except that BERNOULLI or SYSTEM on a
#     server older than 9.5 is rejected with a user-facing error, since those
#     two are emitted as TABLESAMPLE clauses.
sub get_random_method {
    my ($pg_version) = @_;

    my $tablesample_since = version->declare('9.5');
    my $requested         = $opt{random_method};

    # Nothing requested: pick the default for this server version.
    unless (defined $requested) {
        return 'LEGACY' if $pg_version < $tablesample_since;
        return 'BERNOULLI';
    }

    # An explicit TABLESAMPLE-based method needs a new enough server.
    my $uses_tablesample = $requested =~ /^(?:BERNOULLI|SYSTEM)$/;
    if ($uses_tablesample && $pg_version < $tablesample_since) {
        die "Error: $requested requires PostgreSQL 9.5+\n"
          . "Your version: $pg_version\n"
          . "Use --random=legacy (or just --random) for older versions\n";
    }

    return $requested;
}
536+
422537# Encode the actual schema and table name into a new table
423538# name that lives under our sample schema. e.g., a table like
424539# users.details (schema users, table details) would be converted
@@ -506,7 +621,6 @@ sub notice (@) {
506621 db_port => ' ' ,
507622 keep => 0,
508623 ordered => 0,
509- random => 0,
510624 schema => undef ,
511625 sample_schema => ' _pg_sample' ,
512626 verbose => 0,
@@ -529,7 +643,7 @@ GetOptions(\%opt,
529643 " ordered" ,
530644 " ordered_desc|ordered-desc" ,
531645 " ordered_asc|ordered-asc" ,
532- " random" ,
646+ " random:s " ,
533647 " sample_schema=s" ,
534648 " schema=s" ,
535649 " trace" ,
@@ -552,9 +666,22 @@ $opt{ordered} = $opt{ordered_desc} ? 'DESC'
552666 : $opt {ordered_asc } ? ' ASC'
553667 : $opt {ordered } ? ' DESC'
554668 : undef ;
669+
670+ # Process --random[=method] option
671+ # If --random was specified, resolve the method (or set it to undef for auto-detection)
672+ # Note: Getopt::Long with :s returns '' (empty string) when flag is used without value
673+ if (defined $opt {random }) {
674+ # --random with explicit method: resolve it now
675+ # Handle both empty string and '0' (both mean "no method specified")
676+ if ($opt {random } ne ' ' && $opt {random } ne ' 0' ) {
677+ $opt {random_method } = resolve_random_method($opt {random });
678+ }
679+ # --random without method: random_method stays undef, will auto-detect later
680+ # Convert to boolean for backward compatibility checks
681+ $opt {random } = 1;
682+ }
555683if ($opt {random } && $opt {ordered }) {
556- print (" Error: --random and --ordered are mutually exclusive" );
557- exit 1;
684+ die " Error: --random and --ordered are mutually exclusive\n " ;
558685}
559686
560687@ARGV or die " \n Usage: $0 [ option... ] [ dbname ]\n\n\t " .
@@ -689,11 +816,22 @@ foreach my $row (@{$table_info}) {
689816 if (!$opt {random } || $pg_version < version-> declare(' 9.5' )) {
690817 $limit = " LIMIT $_ ->[1]" ;
691818 } else {
692- my ($table_num_rows ) = $dbh -> selectrow_array(qq{
693- SELECT greatest(count(*), ?) FROM $table
694- } , undef , $_ -> [1]);
695- my $percent = 100 * $_ -> [1] / $table_num_rows ;
696- $tablesample = " TABLESAMPLE BERNOULLI ($percent )" ;
819+ my $method = get_random_method($pg_version );
820+
821+ if ($method eq ' LEGACY' ) {
822+ # LEGACY: use exact count for consistency
823+ my ($table_num_rows ) = get_row_count($dbh , $table , use_exact => 1);
824+ $table_num_rows = $_ -> [1] if $table_num_rows < $_ -> [1];
825+ my $percent = 100 * $_ -> [1] / $table_num_rows ;
826+ $limit = " LIMIT $_ ->[1]" ;
827+ # ORDER BY random() is added further below, where $order is assigned
828+ } else {
829+ # TABLESAMPLE: use approximate count (it's already approximate!)
830+ my ($table_num_rows ) = get_row_count($dbh , $table );
831+ $table_num_rows = $_ -> [1] if $table_num_rows < $_ -> [1];
832+ my $percent = 100 * $_ -> [1] / $table_num_rows ;
833+ $tablesample = " TABLESAMPLE $method ($percent )" ;
834+ }
697835 }
698836 } elsif ($_ -> [1] =~ / ^\d +(\.\d +)?%$ / ) { # percent value turned into LIMIT
699837 if (not $opt {random } or $pg_version < version-> declare(' 9.5' )) {
@@ -703,8 +841,19 @@ foreach my $row (@{$table_info}) {
703841
704842 $limit = " LIMIT $total_rows " ;
705843 } else {
844+ my $method = get_random_method($pg_version );
706845 my $percent = (substr $_ -> [1], 0, (length $_ -> [1]) - 1);
707- $tablesample = " TABLESAMPLE BERNOULLI ($percent )" ;
846+
847+ if ($method eq ' LEGACY' ) {
848+ # LEGACY: need to convert percentage to row count
849+ my ($table_num_rows ) = get_row_count($dbh , $table , use_exact => 1);
850+ my $total_rows = int ($table_num_rows * $percent / 100);
851+ $limit = " LIMIT $total_rows " ;
852+ # ORDER BY random() is added further below, where $order is assigned
853+ } else {
854+ # TABLESAMPLE: just pass the percentage directly (no count needed!)
855+ $tablesample = " TABLESAMPLE $method ($percent )" ;
856+ }
708857 }
709858 } else { # otherwise treated as subselect
710859 $where = " ($_ ->[1])" ;
@@ -715,7 +864,10 @@ foreach my $row (@{$table_info}) {
715864 # warn "\n[LIMIT] $table WHERE $where $limit\n";
716865
717866 if ($opt {random } && $pg_version < version-> declare(' 9.5' )) {
718- $order = $opt {random } ? ' ORDER BY random()' : ' ' ;
867+ $order = ' ORDER BY random()' ;
868+ } elsif ($opt {random } && defined ($opt {random_method }) && $opt {random_method } eq ' LEGACY' ) {
869+ # User explicitly requested LEGACY method on PG 9.5+
870+ $order = ' ORDER BY random()' ;
719871 } elsif (my $direction = $opt {ordered }) {
720872 my @cols = find_candidate_key($table );
721873 if (@cols ) {
0 commit comments