diff --git a/docs/source/change-log.rst b/docs/source/change-log.rst index 2ac4e76..867e71d 100644 --- a/docs/source/change-log.rst +++ b/docs/source/change-log.rst @@ -11,6 +11,14 @@ This project adheres to `Semantic Versioning`_. .. _Semantic Versioning: http://semver.org/ +0.8.2 +----- +2026-04-27 + +* Bug Fixes + + * An error was corrected in :py:meth:`~ipumspy.readers.read_hierarchical_microdata()` that was causing ``ValueError`` exceptions when the ``as_dict`` keyword argument was set to ``False``. + 0.8.1 ----- 2026-04-23 @@ -20,7 +28,6 @@ This project adheres to `Semantic Versioning`_. * ``ipumspy`` now accomodates a re-rodering of record types in the ``fileStr`` section of the DDI created by the IPUMS extract system. * The building of extract requests for IPUMS MTUS and AHTUS now correctly accounts for the fact that ``attach_characteristics`` is not a supported feature on these data collections. - 0.8.0 ----- 2026-04-13 diff --git a/pyproject.toml b/pyproject.toml index 384e5a4..0eb87b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "ipumspy" -version = "0.8.1" +version = "0.8.2" description = "A collection of tools for working with IPUMS data" authors = ["Kevin H. Wilson ", "Renae Rodgers "] diff --git a/src/ipumspy/readers.py b/src/ipumspy/readers.py index f943375..5a8ac67 100644 --- a/src/ipumspy/readers.py +++ b/src/ipumspy/readers.py @@ -330,15 +330,23 @@ def read_hierarchical_microdata( pandas data frame or a dictionary of pandas data frames """ # RECTYPE must be included if subset list is specified + # add it if it is missing if subset is not None: - if "RECTYPE" not in subset: - raise ValueError( - "RECTYPE must be included in the subset list for hierarchical extracts." 
+ if ddi.file_description.rectype_idvar not in subset: + subset.append(ddi.file_description.rectype_idvar) + warnings.warn( + f"{ddi.file_description.rectype_idvar} is required to read hierarchical microdata; this variable has been added to the `subset`." ) - else: - data_description = [ - desc for desc in ddi.data_description if desc.name in subset - ] + # add the file's keyvar if not included in the subset + if ddi.file_description.rectype_keyvar not in subset: + subset.append(ddi.file_description.rectype_keyvar) + warnings.warn( + f"{ddi.file_description.rectype_keyvar} is required to read hierarchical microdata; this variable has been added to the `subset`." + ) + + data_description = [ + desc for desc in ddi.data_description if desc.name in subset + ] else: data_description = ddi.data_description @@ -378,7 +386,9 @@ def read_hierarchical_microdata( ) # filter out non-relevant rectype records - df_dict[rectype] = rectype_df[rectype_df["RECTYPE"] == rectype].copy() + df_dict[rectype] = rectype_df[ + rectype_df[ddi.file_description.rectype_idvar] == rectype + ].copy() # Now that the non-relevant rows have been dropped, make data types correct if dtype is None: @@ -400,78 +410,12 @@ def read_hierarchical_microdata( if as_dict: return df_dict else: - dtype_str = {desc.name: pd.StringDtype() for desc in data_description} - # read the hierarchical file - df = pd.concat( - [ - df - for df in _read_microdata( - ddi=ddi, - filename=filename, - encoding=encoding, - dtype=dtype_str, - subset=subset, - **kwargs, - ) - ] - ) - - # for each rectype, nullify variables that belong to other rectypes - for rectype in df_dict.keys(): - # create a list of variables that are for rectypes other than the current rectype - # and are not included in the list of varaibles that are common across rectypes - non_rt_cols = [ - cols - for rt in df_dict.keys() - for cols in df_dict[rt].columns - if rt != rectype and cols not in common_vars - ] - dtype_rt = dtype - if dtype_rt is None: - # 
this fix means that _fix_float_dtypes is actually being called both from within - # _read_microdata() and this method, which is not ideal, but is also the least disruptive - # solution I have found so far. - dtype_rt = { - desc.name: desc.pandas_type - for desc in data_description - if desc.name in non_rt_cols - } - - for col in non_rt_cols: - # maintain data type when "nullifying" variables from other record types - if dtype_rt[col] == pd.Int64Dtype(): - df[col] = np.where(df["RECTYPE"] == rectype, pd.NA, df[col]) - df[col] = df[col].astype( - _fix_float_dtypes({col: dtype_rt[col]}, df[[col]].copy()) - ) - elif ( - dtype_rt[col] == pd.StringDtype() - or dtype_rt[col] == str - or dtype_rt[col] == "string" - ): - df[col] = np.where(df["RECTYPE"] == rectype, "", df[col]) - df[col] = df[col].astype(pd.StringDtype()) - elif ( - dtype_rt[col].dtype == float - or dtype_rt[col] == pd.Float64Dtype() - or dtype_rt[col] == np.float64 - ): - df[col] = np.where(df["RECTYPE"] == rectype, np.nan, df[col]) - df[col] = df[col].astype(dtype_rt[col]) - # this should (theoretically) never be hit... unless someone specifies an illegal data type - # themselves, but that should also be caught before this stage. - else: - raise TypeError( - f"Data type {df[col].dtype} for {col} is not an allowed type." - ) - # XXX common vars are defaulting to pandas. This is probably fine, but could be more flexible. - common_dtype = { - desc.name: desc.pandas_type - for desc in data_description - if desc.name in common_vars - } - for col in common_vars: - df[col] = df[col].astype(common_dtype[col]) + df = pd.concat([df_dict[k] for k in df_dict.keys()]) + # XXX the rectype_keyvar is not always enough to correctly sort the hierarchical extracts + # as it may only be unique within sample. 
+ # Save the pandas index to follow the original file order to use as a temporary sort key + df["idx"] = df.index + df = df.sort_values(by=["idx"]).drop(columns="idx") return df diff --git a/tests/test_readers.py b/tests/test_readers.py index 0e174a4..899b093 100644 --- a/tests/test_readers.py +++ b/tests/test_readers.py @@ -490,21 +490,25 @@ def _assert_cps_rectantular_subset(data: pd.DataFrame): def _assert_cps_hierarchical_subset(data: pd.DataFrame): """Tests subset functionality on hierarchical extracts""" - assert len(data.columns) == 3 + # even though specified subset is 3 vars, we must add SERIAL + assert len(data.columns) == 4 + assert "SERIAL" in data.columns # there has to be a better way to do this... # splitting out nan and non-nan values + print(data.head()) assert (data["MISH"].iloc[:2] == np.array([7, 5])).all() assert data["MISH"].iloc[2:5].isna().all() assert (data["AGE"].iloc[2:5] == np.array([36, 41, 5])).all() assert data["AGE"].iloc[:2].isna().all() + assert (data["RECTYPE"].iloc[:3] == np.array(["H", "H", "P"])).all() def _assert_cps_hierarchical_subset_dict(data: Dict): """Tests subset functionality on hierarchical extracts as dictionaries""" p_data = data["P"] h_data = data["H"] - assert len(p_data.columns) == 2 - assert len(h_data.columns) == 2 + assert len(p_data.columns) == 3 + assert len(h_data.columns) == 3 assert (h_data["MISH"].iloc[:5] == np.array([7, 5, 1, 2, 1])).all() assert (p_data["AGE"].iloc[:5] == np.array([36, 41, 5, 7, 50])).all() @@ -935,13 +939,13 @@ def test_subset_option(fixtures_path: Path): _assert_cps_hierarchical_subset_dict(data) - # ValueError should be raised when rectype not included in hierarchical subset - with pytest.raises(ValueError): + # warn when the rectype var is not included in the subset + with pytest.warns() as record: data = readers.read_hierarchical_microdata( ddi, fixtures_path / "cps_00421.dat.gz", subset=["MISH", "AGE"] ) - with pytest.raises(ValueError): - data = 
readers.read_hierarchical_microdata( - ddi, fixtures_path / "cps_00421.dat.gz", subset=["MISH", "AGE"] - ) + assert ( + str(record[0].message) + == "RECTYPE is required to read hierarchical microdata; this variable has been added to the `subset`." + )