ipums · renae-r · Apr 27, 2026 · Apr 27, 2026 · Apr 27, 2026 · Apr 27, 2026
diff --git a/docs/source/change-log.rst b/docs/source/change-log.rst
@@ -11,6 +11,14 @@ This project adheres to `Semantic Versioning`_.
 .. _Semantic Versioning: http://semver.org/
 
 
+0.8.2
+-----
+2026-04-27
+
+* Bug Fixes
+
+  * An error was corrected in :py:meth:`~ipumspy.readers.read_hierarchical_microdata()` that was causing a ``ValueError`` to be raised when the ``as_dict`` keyword argument was set to ``False``.
+
 0.8.1
 -----
 2026-04-23
@@ -20,7 +28,6 @@ This project adheres to `Semantic Versioning`_.
   * ``ipumspy`` now accomodates a re-rodering of record types in the ``fileStr`` section of the DDI created by the IPUMS extract system.
   * The building of extract requests for IPUMS MTUS and AHTUS now correctly accounts for the fact that ``attach_characteristics`` is not a supported feature on these data collections.
 
-
 0.8.0
 -----
 2026-04-13

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "ipumspy"
-version = "0.8.1"
+version = "0.8.2"
 description = "A collection of tools for working with IPUMS data"
 authors = ["Kevin H. Wilson <kevin_wilson@brown.edu>",
            "Renae Rodgers <rodge103@umn.edu>"]

diff --git a/src/ipumspy/readers.py b/src/ipumspy/readers.py
@@ -330,15 +330,23 @@ def read_hierarchical_microdata(
         pandas data frame or a dictionary of pandas data frames
     """
     # RECTYPE must be included if subset list is specified
+    # add it if it is missing
     if subset is not None:
-        if "RECTYPE" not in subset:
-            raise ValueError(
-                "RECTYPE must be included in the subset list for hierarchical extracts."
+        if ddi.file_description.rectype_idvar not in subset:
+            subset.append(ddi.file_description.rectype_idvar)
+            warnings.warn(
+                f"{ddi.file_description.rectype_idvar} is required to read hierarchical microdata; this variable has been added to the `subset`."
             )
-        else:
-            data_description = [
-                desc for desc in ddi.data_description if desc.name in subset
-            ]
+        # add the file's keyvar if not included in the subset
+        if ddi.file_description.rectype_keyvar not in subset:
+            subset.append(ddi.file_description.rectype_keyvar)
+            warnings.warn(
+                f"{ddi.file_description.rectype_keyvar} is required to read hierarchical microdata; this variable has been added to the `subset`."
+            )
+
+        data_description = [
+            desc for desc in ddi.data_description if desc.name in subset
+        ]
     else:
         data_description = ddi.data_description
 
@@ -378,7 +386,9 @@ def read_hierarchical_microdata(
             )
 
             # filter out non-relevant rectype records
-            df_dict[rectype] = rectype_df[rectype_df["RECTYPE"] == rectype].copy()
+            df_dict[rectype] = rectype_df[
+                rectype_df[ddi.file_description.rectype_idvar] == rectype
+            ].copy()
 
             # Now that the non-relevant rows have been dropped, make data types correct
             if dtype is None:
@@ -400,78 +410,12 @@ def read_hierarchical_microdata(
         if as_dict:
             return df_dict
         else:
-            dtype_str = {desc.name: pd.StringDtype() for desc in data_description}
-            # read the hierarchical file
-            df = pd.concat(
-                [
-                    df
-                    for df in _read_microdata(
-                        ddi=ddi,
-                        filename=filename,
-                        encoding=encoding,
-                        dtype=dtype_str,
-                        subset=subset,
-                        **kwargs,
-                    )
-                ]
-            )
-
-            # for each rectype, nullify variables that belong to other rectypes
-            for rectype in df_dict.keys():
-                # create a list of variables that are for rectypes other than the current rectype
-                # and are not included in the list of varaibles that are common across rectypes
-                non_rt_cols = [
-                    cols
-                    for rt in df_dict.keys()
-                    for cols in df_dict[rt].columns
-                    if rt != rectype and cols not in common_vars
-                ]
-                dtype_rt = dtype
-                if dtype_rt is None:
-                    # this fix means that _fix_float_dtypes is actually being called both from within
-                    # _read_microdata() and this method, which is not ideal, but is also the least disruptive
-                    # solution I have found so far.
-                    dtype_rt = {
-                        desc.name: desc.pandas_type
-                        for desc in data_description
-                        if desc.name in non_rt_cols
-                    }
-
-                for col in non_rt_cols:
-                    # maintain data type when "nullifying" variables from other record types
-                    if dtype_rt[col] == pd.Int64Dtype():
-                        df[col] = np.where(df["RECTYPE"] == rectype, pd.NA, df[col])
-                        df[col] = df[col].astype(
-                            _fix_float_dtypes({col: dtype_rt[col]}, df[[col]].copy())
-                        )
-                    elif (
-                        dtype_rt[col] == pd.StringDtype()
-                        or dtype_rt[col] == str
-                        or dtype_rt[col] == "string"
-                    ):
-                        df[col] = np.where(df["RECTYPE"] == rectype, "", df[col])
-                        df[col] = df[col].astype(pd.StringDtype())
-                    elif (
-                        dtype_rt[col].dtype == float
-                        or dtype_rt[col] == pd.Float64Dtype()
-                        or dtype_rt[col] == np.float64
-                    ):
-                        df[col] = np.where(df["RECTYPE"] == rectype, np.nan, df[col])
-                        df[col] = df[col].astype(dtype_rt[col])
-                    # this should (theoretically) never be hit... unless someone specifies an illegal data type
-                    # themselves, but that should also be caught before this stage.
-                    else:
-                        raise TypeError(
-                            f"Data type {df[col].dtype} for {col} is not an allowed type."
-                        )
-            # XXX common vars are defaulting to pandas. This is probably fine, but could be more flexible.
-            common_dtype = {
-                desc.name: desc.pandas_type
-                for desc in data_description
-                if desc.name in common_vars
-            }
-            for col in common_vars:
-                df[col] = df[col].astype(common_dtype[col])
+            df = pd.concat([df_dict[k] for k in df_dict.keys()])
+            # XXX the rectype_keyvar is not always enough to correctly sort the hierarchical extracts
+            # as it may only be unique within sample.
+            # Save the pandas index to follow the original file order to use as a temporary sort key
+            df["idx"] = df.index
+            df = df.sort_values(by=["idx"]).drop(columns="idx")
 
             return df
 

diff --git a/tests/test_readers.py b/tests/test_readers.py
@@ -490,21 +490,25 @@ def _assert_cps_rectantular_subset(data: pd.DataFrame):
 
 def _assert_cps_hierarchical_subset(data: pd.DataFrame):
     """Tests subset functionality on hierarchical extracts"""
-    assert len(data.columns) == 3
+    # even though specified subset is 3 vars, we must add SERIAL
+    assert len(data.columns) == 4
+    assert "SERIAL" in data.columns
     # there has to be a better way to do this...
     # splitting out nan and non-nan values
+    print(data.head())
     assert (data["MISH"].iloc[:2] == np.array([7, 5])).all()
     assert data["MISH"].iloc[2:5].isna().all()
     assert (data["AGE"].iloc[2:5] == np.array([36, 41, 5])).all()
     assert data["AGE"].iloc[:2].isna().all()
+    assert (data["RECTYPE"].iloc[:3] == np.array(["H", "H", "P"])).all()
 
 
 def _assert_cps_hierarchical_subset_dict(data: Dict):
     """Tests subset functionality on hierarchical extracts as dictionaries"""
     p_data = data["P"]
     h_data = data["H"]
-    assert len(p_data.columns) == 2
-    assert len(h_data.columns) == 2
+    assert len(p_data.columns) == 3
+    assert len(h_data.columns) == 3
     assert (h_data["MISH"].iloc[:5] == np.array([7, 5, 1, 2, 1])).all()
     assert (p_data["AGE"].iloc[:5] == np.array([36, 41, 5, 7, 50])).all()
 
@@ -935,13 +939,13 @@ def test_subset_option(fixtures_path: Path):
 
     _assert_cps_hierarchical_subset_dict(data)
 
-    # ValueError should be raised when rectype not included in hierarchical subset
-    with pytest.raises(ValueError):
+    # warn when the rectype var is not included in the subset
+    with pytest.warns() as record:
         data = readers.read_hierarchical_microdata(
             ddi, fixtures_path / "cps_00421.dat.gz", subset=["MISH", "AGE"]
         )
 
-    with pytest.raises(ValueError):
-        data = readers.read_hierarchical_microdata(
-            ddi, fixtures_path / "cps_00421.dat.gz", subset=["MISH", "AGE"]
-        )
+    assert (
+        str(record[0].message)
+        == "RECTYPE is required to read hierarchical microdata; this variable has been added to the `subset`."
+    )