Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion docs/source/change-log.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ This project adheres to `Semantic Versioning`_.
.. _Semantic Versioning: http://semver.org/


0.8.2
-----
2026-04-27

* Bug Fixes

* An error was corrected in :py:meth:`~ipumspy.readers.read_hierarchical_microdata()` that was causing ``ValueError`` exceptions when the ``as_dict`` keyword argument was set to ``False``.

0.8.1
-----
2026-04-23
Expand All @@ -20,7 +28,6 @@ This project adheres to `Semantic Versioning`_.
* ``ipumspy`` now accommodates a re-ordering of record types in the ``fileStr`` section of the DDI created by the IPUMS extract system.
* The building of extract requests for IPUMS MTUS and AHTUS now correctly accounts for the fact that ``attach_characteristics`` is not a supported feature on these data collections.


0.8.0
-----
2026-04-13
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ipumspy"
version = "0.8.1"
version = "0.8.2"
description = "A collection of tools for working with IPUMS data"
authors = ["Kevin H. Wilson <kevin_wilson@brown.edu>",
"Renae Rodgers <rodge103@umn.edu>"]
Expand Down
104 changes: 24 additions & 80 deletions src/ipumspy/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,15 +330,23 @@ def read_hierarchical_microdata(
pandas data frame or a dictionary of pandas data frames
"""
# RECTYPE must be included if subset list is specified
# add it if it is missing
if subset is not None:
if "RECTYPE" not in subset:
raise ValueError(
"RECTYPE must be included in the subset list for hierarchical extracts."
if ddi.file_description.rectype_idvar not in subset:
subset.append(ddi.file_description.rectype_idvar)
warnings.warn(
f"{ddi.file_description.rectype_idvar} is required to read hierarchical microdata; this variable has been added to the `subset`."
)
else:
data_description = [
desc for desc in ddi.data_description if desc.name in subset
]
# add the file's keyvar if not included in the subset
if ddi.file_description.rectype_keyvar not in subset:
subset.append(ddi.file_description.rectype_keyvar)
warnings.warn(
f"{ddi.file_description.rectype_keyvar} is required to read hierarchical microdata; this variable has been added to the `subset`."
)

data_description = [
desc for desc in ddi.data_description if desc.name in subset
]
else:
data_description = ddi.data_description

Expand Down Expand Up @@ -378,7 +386,9 @@ def read_hierarchical_microdata(
)

# filter out non-relevant rectype records
df_dict[rectype] = rectype_df[rectype_df["RECTYPE"] == rectype].copy()
df_dict[rectype] = rectype_df[
rectype_df[ddi.file_description.rectype_idvar] == rectype
].copy()

# Now that the non-relevant rows have been dropped, make data types correct
if dtype is None:
Expand All @@ -400,78 +410,12 @@ def read_hierarchical_microdata(
if as_dict:
return df_dict
else:
dtype_str = {desc.name: pd.StringDtype() for desc in data_description}
# read the hierarchical file
df = pd.concat(
[
df
for df in _read_microdata(
ddi=ddi,
filename=filename,
encoding=encoding,
dtype=dtype_str,
subset=subset,
**kwargs,
)
]
)

# for each rectype, nullify variables that belong to other rectypes
for rectype in df_dict.keys():
# create a list of variables that are for rectypes other than the current rectype
# and are not included in the list of variables that are common across rectypes
non_rt_cols = [
cols
for rt in df_dict.keys()
for cols in df_dict[rt].columns
if rt != rectype and cols not in common_vars
]
dtype_rt = dtype
if dtype_rt is None:
# this fix means that _fix_float_dtypes is actually being called both from within
# _read_microdata() and this method, which is not ideal, but is also the least disruptive
# solution I have found so far.
dtype_rt = {
desc.name: desc.pandas_type
for desc in data_description
if desc.name in non_rt_cols
}

for col in non_rt_cols:
# maintain data type when "nullifying" variables from other record types
if dtype_rt[col] == pd.Int64Dtype():
df[col] = np.where(df["RECTYPE"] == rectype, pd.NA, df[col])
df[col] = df[col].astype(
_fix_float_dtypes({col: dtype_rt[col]}, df[[col]].copy())
)
elif (
dtype_rt[col] == pd.StringDtype()
or dtype_rt[col] == str
or dtype_rt[col] == "string"
):
df[col] = np.where(df["RECTYPE"] == rectype, "", df[col])
df[col] = df[col].astype(pd.StringDtype())
elif (
dtype_rt[col].dtype == float
or dtype_rt[col] == pd.Float64Dtype()
or dtype_rt[col] == np.float64
):
df[col] = np.where(df["RECTYPE"] == rectype, np.nan, df[col])
df[col] = df[col].astype(dtype_rt[col])
# this should (theoretically) never be hit... unless someone specifies an illegal data type
# themselves, but that should also be caught before this stage.
else:
raise TypeError(
f"Data type {df[col].dtype} for {col} is not an allowed type."
)
# XXX common vars are defaulting to pandas. This is probably fine, but could be more flexible.
common_dtype = {
desc.name: desc.pandas_type
for desc in data_description
if desc.name in common_vars
}
for col in common_vars:
df[col] = df[col].astype(common_dtype[col])
df = pd.concat([df_dict[k] for k in df_dict.keys()])
# XXX the rectype_keyvar is not always enough to correctly sort the hierarchical extracts
# as it may only be unique within sample.
# Save the pandas index to follow the original file order to use as a temporary sort key
df["idx"] = df.index
df = df.sort_values(by=["idx"]).drop(columns="idx")

return df

Expand Down
22 changes: 13 additions & 9 deletions tests/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,21 +490,25 @@ def _assert_cps_rectantular_subset(data: pd.DataFrame):

def _assert_cps_hierarchical_subset(data: pd.DataFrame):
"""Tests subset functionality on hierarchical extracts"""
assert len(data.columns) == 3
# even though specified subset is 3 vars, we must add SERIAL
assert len(data.columns) == 4
assert "SERIAL" in data.columns
# there has to be a better way to do this...
# splitting out nan and non-nan values
print(data.head())
assert (data["MISH"].iloc[:2] == np.array([7, 5])).all()
assert data["MISH"].iloc[2:5].isna().all()
assert (data["AGE"].iloc[2:5] == np.array([36, 41, 5])).all()
assert data["AGE"].iloc[:2].isna().all()
assert (data["RECTYPE"].iloc[:3] == np.array(["H", "H", "P"])).all()


def _assert_cps_hierarchical_subset_dict(data: Dict):
"""Tests subset functionality on hierarchical extracts as dictionaries"""
p_data = data["P"]
h_data = data["H"]
assert len(p_data.columns) == 2
assert len(h_data.columns) == 2
assert len(p_data.columns) == 3
assert len(h_data.columns) == 3
assert (h_data["MISH"].iloc[:5] == np.array([7, 5, 1, 2, 1])).all()
assert (p_data["AGE"].iloc[:5] == np.array([36, 41, 5, 7, 50])).all()

Expand Down Expand Up @@ -935,13 +939,13 @@ def test_subset_option(fixtures_path: Path):

_assert_cps_hierarchical_subset_dict(data)

# ValueError should be raised when rectype not included in hierarchical subset
with pytest.raises(ValueError):
# warn when the rectype var is not included in the subset
with pytest.warns() as record:
data = readers.read_hierarchical_microdata(
ddi, fixtures_path / "cps_00421.dat.gz", subset=["MISH", "AGE"]
)

with pytest.raises(ValueError):
data = readers.read_hierarchical_microdata(
ddi, fixtures_path / "cps_00421.dat.gz", subset=["MISH", "AGE"]
)
assert (
str(record[0].message)
== "RECTYPE is required to read hierarchical microdata; this variable has been added to the `subset`."
)
Loading