Skip to content

Commit 04db804

Browse files
Fix: Force index type to str (#28)
1 parent e61fbe2 commit 04db804

2 files changed

Lines changed: 8 additions & 2 deletions

File tree

data_validation_framework/task.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -295,10 +295,14 @@ def kwargs(self):
295295
def read_dataset(self):
296296
"""Import the dataset to a :class:`pandas.DataFrame`.
297297
298+
Note that the index column is loaded as a string.
299+
298300
This method can be overridden to load custom data (e.g. GeoDataFrame, etc.).
299301
The dataset should always be loaded from the path given by `self.dataset_df`.
300302
"""
301-
return pd.read_csv(self.dataset_df, index_col=self.input_index_col)
303+
return pd.read_csv(
304+
self.dataset_df, index_col=self.input_index_col, dtype={self.input_index_col: str}
305+
)
302306

303307
def pre_process(self, df, args, kwargs):
304308
"""Method executed before applying the external function."""

tests/test_task.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1274,7 +1274,9 @@ def check_exception(failed_task, exception): # pylint: disable=unused-variable
12741274
assert not luigi.build([failing_task], local_scheduler=True)
12751275

12761276
assert failed_tasks == [str(failing_task)]
1277-
assert exceptions == [str(IndexError("The following index values are duplicated: [0, 1]"))]
1277+
assert exceptions == [
1278+
str(IndexError("The following index values are duplicated: ['0', '1']"))
1279+
]
12781280

12791281
def test_change_index(self, tmpdir, TestTask):
12801282
dataset_df_path = str(tmpdir / "dataset.csv")

0 commit comments

Comments
 (0)