Skip to content

Commit 9f50906

Browse files
committed
Merge branch 'report'
2 parents 7ab51c6 + 072600d commit 9f50906

1 file changed

Lines changed: 137 additions & 7 deletions

File tree

FastOMA/fastoma_notebook_stat.ipynb

Lines changed: 137 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -167,9 +167,9 @@
167167
"metadata": {},
168168
"outputs": [],
169169
"source": [
170-
"sns.histplot(data=protein_df.groupby(\"species\", as_index=False).count(), stat=\"probability\", bins=30)\n",
170+
"sns.histplot(data=protein_df.groupby(\"species\", as_index=False).count(), stat=\"count\", bins=30)\n",
171171
"plt.xlabel(\"Nr proteins in each species\")\n",
172-
"plt.ylabel(\"Frequency\")\n",
172+
"plt.ylabel(\"Counts of proteomes\")\n",
173173
"plt.title(\"Distribution of number of proteins per species\")"
174174
]
175175
},
@@ -204,7 +204,8 @@
204204
"source": [
205205
"sns.displot(protein_df, x=\"prot_len\", hue=\"species\", kind=\"kde\", height=8)\n",
206206
"plt.xlim(0, 2000)\n",
207-
"plt.title(\"Protein length distribution per species\", fontsize=20)\n"
207+
"plt.title(\"Protein length distribution per species\", fontsize=20)\n",
208+
"plt.xlabel(\"Protein length\")\n"
208209
]
209210
},
210211
{
@@ -269,6 +270,8 @@
269270
"df_seq = pd.merge(hog_summary_df, protein_df.groupby(\"species\", as_index=False).count(), on='species')\n",
270271
"df_seq['minor_splice'] = df_seq['prot_len']-df_seq['genes']\n",
271272
"df_seq = df_seq[['species', 'genes', 'not_in_group','minor_splice']]\n",
273+
"order = species_tree.get_leaf_names()\n",
274+
"df_seq = df_seq.sort_values(by=['species'], key=lambda s: s.apply(order.index)).set_index('species')\n",
272275
"df_seq"
273276
]
274277
},
@@ -289,11 +292,11 @@
289292
"metadata": {},
290293
"outputs": [],
291294
"source": [
292-
"df_seq.set_index('species').plot(kind='bar', stacked=True)\n",
295+
"df_seq.plot(kind='bar', stacked=True)\n",
293296
"plt.title('Number of proteins in HOGs / singltons / minor splice variants', fontsize=16)\n",
294297
"plt.xlabel('Species')\n",
295298
"plt.ylabel('Counts')\n",
296-
"plt.xticks(rotation=45);"
299+
"plt.xticks(rotation=90);"
297300
]
298301
},
299302
{
@@ -316,14 +319,47 @@
316319
"hog_df"
317320
]
318321
},
322+
{
323+
"cell_type": "markdown",
324+
"id": "44872248-8ad8-487c-89c2-147337d2c5f1",
325+
"metadata": {},
326+
"source": [
327+
"Now, we can slice the HOGs in various ways. Remember, they are nested, so it usually makes sense to analyse either all the HOGs at their root level or alternativly look at a specific taxonomic level.\n",
328+
"\n",
329+
"## Roothogs (deepest levels for every HOG)\n",
330+
"\n",
331+
"Here, we first look at the all the RootHOGs. We can select them using the `is_roothog` column"
332+
]
333+
},
319334
{
320335
"cell_type": "code",
321336
"execution_count": null,
322337
"id": "0a8c5e3f-daed-431c-ba79-46a0f701a779",
323338
"metadata": {},
324339
"outputs": [],
325340
"source": [
326-
"sns.scatterplot(data=hog_df[hog_df['is_roothog']==True], x='nr_members', y='CompletenessScore')"
341+
"roothog_df = hog_df[(hog_df['is_roothog']==True)]\n",
342+
"print(f\"Number of RootHOGs: {len(roothog_df)}\")\n",
343+
"roothog_df.describe()"
344+
]
345+
},
346+
{
347+
"cell_type": "markdown",
348+
"id": "ec10e7c9-b803-45c1-b131-302c12b1da6c",
349+
"metadata": {},
350+
"source": [
351+
"We can further analyse how complete these roothogs are. The `CompletenessScore` contains the fraction of species that have at least one gene in the HOG. "
352+
]
353+
},
354+
{
355+
"cell_type": "code",
356+
"execution_count": null,
357+
"id": "c90ff7ef-c88f-4c74-bfbb-0cf2fa0d5901",
358+
"metadata": {},
359+
"outputs": [],
360+
"source": [
361+
"sns.displot(roothog_df, x=\"CompletenessScore\", kind=\"ecdf\")\n",
362+
"plt.title(\"Cumulative distribution of CompletenessScore for all RootHOGs\", fontsize=16);\n"
327363
]
328364
},
329365
{
@@ -333,7 +369,101 @@
333369
"metadata": {},
334370
"outputs": [],
335371
"source": [
336-
"sns.jointplot(data=hog_df[hog_df['is_roothog']==True], x='nr_members', y='CompletenessScore')"
372+
"g = sns.jointplot(data=roothog_df, \n",
373+
" kind=\"scatter\",\n",
374+
" x='nr_members', \n",
375+
" y='CompletenessScore', \n",
376+
" marginal_kws=dict(bins=20, element=\"step\"), \n",
377+
" marginal_ticks=True,\n",
378+
" height=11)\n",
379+
"g.fig.suptitle(\"Distribution of HOG sizes and CompletenessScores for all RootHOGs\", fontsize=16)\n",
380+
"g.ax_joint.set_xlabel(\"HOG size (number of member genes)\", fontsize=14) \n",
381+
"g.ax_joint.set_ylabel(\"Completeness Score of HOG\", fontsize=14)\n",
382+
"g.fig.tight_layout()\n",
383+
"g.fig.subplots_adjust(top=0.95)"
384+
]
385+
},
386+
{
387+
"cell_type": "markdown",
388+
"id": "9954b3f6-f435-496a-8fab-7da09baffc47",
389+
"metadata": {},
390+
"source": [
391+
"Alternatively, we can also analyse the HOGs at a given taxonomic level. Here, we generate the plot for a relatively deep level in the species tree. You can change this to more useful levels tfor your dataset.\n",
392+
"\n",
393+
"## HOGs at a taxonomic level"
394+
]
395+
},
396+
{
397+
"cell_type": "code",
398+
"execution_count": null,
399+
"id": "0e6476e7-694f-4108-8a02-3e86374416ad",
400+
"metadata": {},
401+
"outputs": [],
402+
"source": [
403+
"desired_subtree_size = max(2, int(0.4*len(species_tree.get_leaf_names())))\n",
404+
"node = species_tree\n",
405+
"while True:\n",
406+
" nr_child = [len(c.get_leaves()) for c in node.get_descendants()]\n",
407+
" if max(nr_child) < desired_subtree_size:\n",
408+
" break\n",
409+
" k = nr_child.index(max(nr_child))\n",
410+
" node = node.get_descendants()[k]\n",
411+
"level = node.name\n",
412+
"print(f\"We've selected {level} as our level of interest\\nIt contains {len(node.get_leaf_names())} species (out of {len(species_tree.get_leaves())}).\\nYou can use a different level by setting the level variable in the next cell instead.\")"
413+
]
414+
},
415+
{
416+
"cell_type": "code",
417+
"execution_count": null,
418+
"id": "f164e5f4-85f9-40ec-9dc7-1f7ff963ca25",
419+
"metadata": {},
420+
"outputs": [],
421+
"source": [
422+
"#level = \"XXX\""
423+
]
424+
},
425+
{
426+
"cell_type": "code",
427+
"execution_count": null,
428+
"id": "7066668b-f260-4b7d-a9a0-68141f1149ef",
429+
"metadata": {},
430+
"outputs": [],
431+
"source": [
432+
"level_df = hog_df[(hog_df['level']==level)]\n",
433+
"print(f\"Number of HOGs at {level}: {len(level_df)}\")\n",
434+
"level_df.describe()"
435+
]
436+
},
437+
{
438+
"cell_type": "code",
439+
"execution_count": null,
440+
"id": "ee844afe-fa0a-4b3b-8fe3-4e9fec48bf36",
441+
"metadata": {},
442+
"outputs": [],
443+
"source": [
444+
"sns.displot(level_df, x=\"CompletenessScore\", kind=\"ecdf\")\n",
445+
"plt.title(f\"Cumulative distribution on CompletenessScore for HOGs at {level}\");\n"
446+
]
447+
},
448+
{
449+
"cell_type": "code",
450+
"execution_count": null,
451+
"id": "3f572007-d324-4310-bc55-b210e19034f8",
452+
"metadata": {},
453+
"outputs": [],
454+
"source": [
455+
"g = sns.jointplot(data=level_df, \n",
456+
" kind=\"scatter\",\n",
457+
" x='nr_members', \n",
458+
" y='CompletenessScore', \n",
459+
" marginal_kws=dict(bins=20, element=\"step\"), \n",
460+
" marginal_ticks=True,\n",
461+
" height=11)\n",
462+
"g.fig.suptitle(f\"Distribution of HOG sizes and CompletenessScores for HOGs at {level}\", fontsize=16)\n",
463+
"g.ax_joint.set_xlabel(\"HOG size (number of member genes)\", fontsize=14) \n",
464+
"g.ax_joint.set_ylabel(\"Completeness Score of HOG\", fontsize=14)\n",
465+
"g.fig.tight_layout()\n",
466+
"g.fig.subplots_adjust(top=0.95) "
337467
]
338468
},
339469
{

0 commit comments

Comments
 (0)