From d073c1b90b4f5e245a79c9658262ee1101b0c45e Mon Sep 17 00:00:00 2001 From: Sai Asish Y Date: Tue, 19 May 2026 14:27:30 -0700 Subject: [PATCH] feat(csvclean): add --remove-empty-columns option --- CHANGELOG.rst | 5 +++++ csvkit/cleanup.py | 21 +++++++++++++-------- csvkit/utilities/csvclean.py | 20 ++++++++++++++++---- docs/scripts/csvclean.rst | 5 +++++ tests/test_utilities/test_csvclean.py | 14 ++++++++++++++ 5 files changed, 53 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d24f090f0..f8bbaf20d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,8 @@ +Unreleased +---------- + +- feat: :doc:`/scripts/csvclean` adds a :code:`--remove-empty-columns` option to drop columns that are empty in all data rows from standard output. + 2.2.0 - December 15, 2025 ------------------------- diff --git a/csvkit/cleanup.py b/csvkit/cleanup.py index 67b161a6f..c19e7b391 100644 --- a/csvkit/cleanup.py +++ b/csvkit/cleanup.py @@ -49,6 +49,7 @@ def __init__( # Other zero_based=False, omit_error_rows=False, + report_empty_columns=True, ): self.reader = reader # Checks @@ -62,6 +63,7 @@ def __init__( # Other self.zero_based = zero_based self.omit_error_rows = omit_error_rows + self.report_empty_columns = report_empty_columns try: self.column_names = next(reader) @@ -71,6 +73,7 @@ def __init__( self.column_names = [] self.errors = [] + self.empty_column_indices = [] def checked_rows(self): """ @@ -147,12 +150,14 @@ def checked_rows(self): if row_count: # Don't report all columns as empty if there are no data rows. if empty_columns := [i for i, count in enumerate(empty_counts) if count == row_count]: - offset = 0 if self.zero_based else 1 - self.errors.append( - Error( - 1, - ["" for _ in range(len_column_names)], - f"Empty columns named {', '.join(repr(self.column_names[i]) for i in empty_columns)}! " - f"Try: csvcut -C {','.join(str(i + offset) for i in empty_columns)}", + self.empty_column_indices = empty_columns + if self.report_empty_columns: + offset = 0 if self.zero_based else 1 + self.errors.append( + Error( + 1, + ["" for _ in range(len_column_names)], + f"Empty columns named {', '.join(repr(self.column_names[i]) for i in empty_columns)}! " + f"Try: csvcut -C {','.join(str(i + offset) for i in empty_columns)}", + ) ) - ) diff --git a/csvkit/utilities/csvclean.py b/csvkit/utilities/csvclean.py index ca8e15f62..3940c4085 100644 --- a/csvkit/utilities/csvclean.py +++ b/csvkit/utilities/csvclean.py @@ -45,6 +45,9 @@ def add_arguments(self): self.argparser.add_argument( '--fillvalue', dest='fillvalue', help='The value with which to fill short rows. Defaults to none.') + self.argparser.add_argument( + '--remove-empty-columns', dest='remove_empty_columns', action='store_true', + help='Remove columns that are empty in all data rows from standard output.') def main(self): if self.additional_input_expected(): @@ -59,6 +62,7 @@ def main(self): and not self.args.header_normalize_space and not self.args.join_short_rows and not self.args.fill_short_rows + and not self.args.remove_empty_columns ): self.argparser.error('No checks or fixes were enabled. See available options with: csvclean --help') @@ -73,7 +77,7 @@ def main(self): reader, # Checks length_mismatch=default or self.args.length_mismatch, - empty_columns=default or self.args.empty_columns, + empty_columns=default or self.args.empty_columns or self.args.remove_empty_columns, # Fixes header_normalize_space=self.args.header_normalize_space, join_short_rows=self.args.join_short_rows, @@ -83,6 +87,7 @@ def main(self): # Other zero_based=self.args.zero_based, omit_error_rows=self.args.omit_error_rows, + report_empty_columns=default or self.args.empty_columns, ) label = self.args.label @@ -93,9 +98,16 @@ def main(self): label = self.input_file.name output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs) - output_writer.writerow(checker.column_names) - for row in checker.checked_rows(): - output_writer.writerow(row) + if self.args.remove_empty_columns: + rows = list(checker.checked_rows()) + keep = [i for i, name in enumerate(checker.column_names) if i not in checker.empty_column_indices] + output_writer.writerow([checker.column_names[i] for i in keep]) + for row in rows: + output_writer.writerow([row[i] if i < len(row) else '' for i in keep]) + else: + output_writer.writerow(checker.column_names) + for row in checker.checked_rows(): + output_writer.writerow(row) if checker.errors: error_writer = agate.csv.writer(self.error_file, **self.writer_kwargs) diff --git a/docs/scripts/csvclean.rst b/docs/scripts/csvclean.rst index f5e8b8522..4bbe281e3 100644 --- a/docs/scripts/csvclean.rst +++ b/docs/scripts/csvclean.rst @@ -76,6 +76,8 @@ Fixes 1,Alice,US 2,Bob,CA +- If a CSV has columns that are empty in every data row, use :code:`--remove-empty-columns` to drop them from standard output. + .. seealso:: :code:`--header-normalize-space` under :ref:`csvclean-usage`. @@ -140,6 +142,9 @@ Usage --fillvalue FILLVALUE The value with which to fill short rows. Defaults to none. + --remove-empty-columns + Remove columns that are empty in all data rows from + standard output. See also: :doc:`../common_arguments`. diff --git a/tests/test_utilities/test_csvclean.py b/tests/test_utilities/test_csvclean.py index b2fd5a866..45c87384b 100644 --- a/tests/test_utilities/test_csvclean.py +++ b/tests/test_utilities/test_csvclean.py @@ -168,6 +168,20 @@ def test_empty_columns_zero(self): ['1', "Empty columns named 'b', '', ''! Try: csvcut -C 1,3,4", '', '', '', '', ''], ]) + def test_remove_empty_columns(self): + self.assertCleaned(['--remove-empty-columns', 'examples/test_empty_columns.csv'], [ + ['a', 'c'], + ['a', ''], + ['', 'c'], + ['', ''], + ]) + + def test_remove_empty_columns_no_empty(self): + self.assertCleaned(['--remove-empty-columns', 'examples/dummy.csv'], [ + ['a', 'b', 'c'], + ['1', '2', '3'], + ]) + def test_enable_all_checks(self): self.assertCleaned(['-a', 'examples/test_empty_columns.csv'], [ ['a', 'b', 'c', '', ''],