Merge branch 'report'

alpae · alpae · commit 9f509062f922 · 2024-01-15T14:06:56.000+01:00
diff --git a/FastOMA/fastoma_notebook_stat.ipynb b/FastOMA/fastoma_notebook_stat.ipynb
@@ -167,9 +167,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sns.histplot(data=protein_df.groupby(\"species\", as_index=False).count(), stat=\"probability\", bins=30)\n",
+    "sns.histplot(data=protein_df.groupby(\"species\", as_index=False).count(), stat=\"count\", bins=30)\n",
     "plt.xlabel(\"Nr proteins in each species\")\n",
-    "plt.ylabel(\"Frequency\")\n",
+    "plt.ylabel(\"Counts of proteomes\")\n",
     "plt.title(\"Distribution of number of proteins per species\")"
    ]
   },
@@ -204,7 +204,8 @@
    "source": [
     "sns.displot(protein_df, x=\"prot_len\", hue=\"species\", kind=\"kde\", height=8)\n",
     "plt.xlim(0, 2000)\n",
-    "plt.title(\"Protein length distribution per species\", fontsize=20)\n"
+    "plt.title(\"Protein length distribution per species\", fontsize=20)\n",
+    "plt.xlabel(\"Protein length\")\n"
    ]
   },
   {
@@ -269,6 +270,8 @@
     "df_seq = pd.merge(hog_summary_df, protein_df.groupby(\"species\", as_index=False).count(), on='species')\n",
     "df_seq['minor_splice'] = df_seq['prot_len']-df_seq['genes']\n",
     "df_seq = df_seq[['species', 'genes', 'not_in_group','minor_splice']]\n",
+    "order = species_tree.get_leaf_names()\n",
+    "df_seq = df_seq.sort_values(by=['species'], key=lambda s: s.apply(order.index)).set_index('species')\n",
     "df_seq"
    ]
   },
@@ -289,11 +292,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_seq.set_index('species').plot(kind='bar', stacked=True)\n",
+    "df_seq.plot(kind='bar', stacked=True)\n",
     "plt.title('Number of proteins in HOGs / singltons / minor splice variants', fontsize=16)\n",
     "plt.xlabel('Species')\n",
     "plt.ylabel('Counts')\n",
-    "plt.xticks(rotation=45);"
+    "plt.xticks(rotation=90);"
    ]
   },
   {
@@ -316,14 +319,47 @@
     "hog_df"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "44872248-8ad8-487c-89c2-147337d2c5f1",
+   "metadata": {},
+   "source": [
+    "Now, we can slice the HOGs in various ways. Remember, they are nested, so it usually makes sense to analyse either all the HOGs at their root level or alternatively look at a specific taxonomic level.\n",
+    "\n",
+    "## Roothogs (deepest levels for every HOG)\n",
+    "\n",
+    "Here, we first look at all the RootHOGs. We can select them using the `is_roothog` column."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "id": "0a8c5e3f-daed-431c-ba79-46a0f701a779",
    "metadata": {},
    "outputs": [],
    "source": [
-    "sns.scatterplot(data=hog_df[hog_df['is_roothog']==True], x='nr_members', y='CompletenessScore')"
+    "roothog_df = hog_df[(hog_df['is_roothog']==True)]\n",
+    "print(f\"Number of RootHOGs: {len(roothog_df)}\")\n",
+    "roothog_df.describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ec10e7c9-b803-45c1-b131-302c12b1da6c",
+   "metadata": {},
+   "source": [
+    "We can further analyse how complete these roothogs are. The `CompletenessScore` contains the fraction of species that have at least one gene in the HOG. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c90ff7ef-c88f-4c74-bfbb-0cf2fa0d5901",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.displot(roothog_df, x=\"CompletenessScore\", kind=\"ecdf\")\n",
+    "plt.title(\"Cumulative distribution of CompletenessScore for all RootHOGs\", fontsize=16);\n"
    ]
   },
   {
@@ -333,7 +369,101 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "sns.jointplot(data=hog_df[hog_df['is_roothog']==True], x='nr_members', y='CompletenessScore')"
+    "g = sns.jointplot(data=roothog_df, \n",
+    "                  kind=\"scatter\",\n",
+    "                  x='nr_members', \n",
+    "                  y='CompletenessScore', \n",
+    "                  marginal_kws=dict(bins=20, element=\"step\"), \n",
+    "                  marginal_ticks=True,\n",
+    "                  height=11)\n",
+    "g.fig.suptitle(\"Distribution of HOG sizes and CompletenessScores for all RootHOGs\", fontsize=16)\n",
+    "g.ax_joint.set_xlabel(\"HOG size (number of member genes)\", fontsize=14) \n",
+    "g.ax_joint.set_ylabel(\"Completeness Score of HOG\", fontsize=14)\n",
+    "g.fig.tight_layout()\n",
+    "g.fig.subplots_adjust(top=0.95)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9954b3f6-f435-496a-8fab-7da09baffc47",
+   "metadata": {},
+   "source": [
+    "Alternatively, we can also analyse the HOGs at a given taxonomic level. Here, we generate the plot for a relatively deep level in the species tree. You can change this to a more useful level for your dataset.\n",
+    "\n",
+    "## HOGs at a taxonomic level"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e6476e7-694f-4108-8a02-3e86374416ad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "desired_subtree_size = max(2, int(0.4*len(species_tree.get_leaf_names())))\n",
+    "node = species_tree\n",
+    "while True:\n",
+    "    nr_child = [len(c.get_leaves()) for c in node.get_descendants()]\n",
+    "    if max(nr_child) < desired_subtree_size:\n",
+    "        break\n",
+    "    k = nr_child.index(max(nr_child))\n",
+    "    node = node.get_descendants()[k]\n",
+    "level = node.name\n",
+    "print(f\"We've selected {level} as our level of interest\\nIt contains {len(node.get_leaf_names())} species (out of {len(species_tree.get_leaves())}).\\nYou can use a different level by setting the level variable in the next cell instead.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f164e5f4-85f9-40ec-9dc7-1f7ff963ca25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#level = \"XXX\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7066668b-f260-4b7d-a9a0-68141f1149ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "level_df = hog_df[(hog_df['level']==level)]\n",
+    "print(f\"Number of HOGs at {level}: {len(level_df)}\")\n",
+    "level_df.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ee844afe-fa0a-4b3b-8fe3-4e9fec48bf36",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sns.displot(level_df, x=\"CompletenessScore\", kind=\"ecdf\")\n",
+    "plt.title(f\"Cumulative distribution on CompletenessScore for HOGs at {level}\");\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3f572007-d324-4310-bc55-b210e19034f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "g = sns.jointplot(data=level_df, \n",
+    "              kind=\"scatter\",\n",
+    "              x='nr_members', \n",
+    "              y='CompletenessScore', \n",
+    "              marginal_kws=dict(bins=20, element=\"step\"), \n",
+    "              marginal_ticks=True,\n",
+    "              height=11)\n",
+    "g.fig.suptitle(f\"Distribution of HOG sizes and CompletenessScores for HOGs at {level}\", fontsize=16)\n",
+    "g.ax_joint.set_xlabel(\"HOG size (number of member genes)\", fontsize=14) \n",
+    "g.ax_joint.set_ylabel(\"Completeness Score of HOG\", fontsize=14)\n",
+    "g.fig.tight_layout()\n",
+    "g.fig.subplots_adjust(top=0.95) "
    ]
   },
   {