|
167 | 167 | "metadata": {}, |
168 | 168 | "outputs": [], |
169 | 169 | "source": [ |
170 | | - "sns.histplot(data=protein_df.groupby(\"species\", as_index=False).count(), stat=\"probability\", bins=30)\n", |
| 170 | + "sns.histplot(data=protein_df.groupby(\"species\", as_index=False).count(), stat=\"count\", bins=30)\n", |
171 | 171 | "plt.xlabel(\"Nr proteins in each species\")\n", |
172 | | - "plt.ylabel(\"Frequency\")\n", |
| 172 | + "plt.ylabel(\"Counts of proteomes\")\n", |
173 | 173 | "plt.title(\"Distribution of number of proteins per species\")" |
174 | 174 | ] |
175 | 175 | }, |
|
204 | 204 | "source": [ |
205 | 205 | "sns.displot(protein_df, x=\"prot_len\", hue=\"species\", kind=\"kde\", height=8)\n", |
206 | 206 | "plt.xlim(0, 2000)\n", |
207 | | - "plt.title(\"Protein length distribution per species\", fontsize=20)\n" |
| 207 | + "plt.title(\"Protein length distribution per species\", fontsize=20)\n", |
| 208 | + "plt.xlabel(\"Protein length\")\n" |
208 | 209 | ] |
209 | 210 | }, |
210 | 211 | { |
|
269 | 270 | "df_seq = pd.merge(hog_summary_df, protein_df.groupby(\"species\", as_index=False).count(), on='species')\n", |
270 | 271 | "df_seq['minor_splice'] = df_seq['prot_len']-df_seq['genes']\n", |
271 | 272 | "df_seq = df_seq[['species', 'genes', 'not_in_group','minor_splice']]\n", |
| 273 | + "order = species_tree.get_leaf_names()\n", |
| 274 | + "df_seq = df_seq.sort_values(by=['species'], key=lambda s: s.apply(order.index)).set_index('species')\n", |
272 | 275 | "df_seq" |
273 | 276 | ] |
274 | 277 | }, |
|
289 | 292 | "metadata": {}, |
290 | 293 | "outputs": [], |
291 | 294 | "source": [ |
292 | | - "df_seq.set_index('species').plot(kind='bar', stacked=True)\n", |
| 295 | + "df_seq.plot(kind='bar', stacked=True)\n", |
293 | 296 | "plt.title('Number of proteins in HOGs / singltons / minor splice variants', fontsize=16)\n", |
294 | 297 | "plt.xlabel('Species')\n", |
295 | 298 | "plt.ylabel('Counts')\n", |
296 | | - "plt.xticks(rotation=45);" |
| 299 | + "plt.xticks(rotation=90);" |
297 | 300 | ] |
298 | 301 | }, |
299 | 302 | { |
|
316 | 319 | "hog_df" |
317 | 320 | ] |
318 | 321 | }, |
| 322 | + { |
| 323 | + "cell_type": "markdown", |
| 324 | + "id": "44872248-8ad8-487c-89c2-147337d2c5f1", |
| 325 | + "metadata": {}, |
| 326 | + "source": [ |
| 327 | + "Now, we can slice the HOGs in various ways. Remember, they are nested, so it usually makes sense to analyse either all the HOGs at their root level or alternativly look at a specific taxonomic level.\n", |
| 328 | + "\n", |
| 329 | + "## Roothogs (deepest levels for every HOG)\n", |
| 330 | + "\n", |
| 331 | + "Here, we first look at the all the RootHOGs. We can select them using the `is_roothog` column" |
| 332 | + ] |
| 333 | + }, |
319 | 334 | { |
320 | 335 | "cell_type": "code", |
321 | 336 | "execution_count": null, |
322 | 337 | "id": "0a8c5e3f-daed-431c-ba79-46a0f701a779", |
323 | 338 | "metadata": {}, |
324 | 339 | "outputs": [], |
325 | 340 | "source": [ |
326 | | - "sns.scatterplot(data=hog_df[hog_df['is_roothog']==True], x='nr_members', y='CompletenessScore')" |
| 341 | + "roothog_df = hog_df[(hog_df['is_roothog']==True)]\n", |
| 342 | + "print(f\"Number of RootHOGs: {len(roothog_df)}\")\n", |
| 343 | + "roothog_df.describe()" |
| 344 | + ] |
| 345 | + }, |
| 346 | + { |
| 347 | + "cell_type": "markdown", |
| 348 | + "id": "ec10e7c9-b803-45c1-b131-302c12b1da6c", |
| 349 | + "metadata": {}, |
| 350 | + "source": [ |
| 351 | + "We can further analyse how complete these roothogs are. The `CompletenessScore` contains the fraction of species that have at least one gene in the HOG. " |
| 352 | + ] |
| 353 | + }, |
| 354 | + { |
| 355 | + "cell_type": "code", |
| 356 | + "execution_count": null, |
| 357 | + "id": "c90ff7ef-c88f-4c74-bfbb-0cf2fa0d5901", |
| 358 | + "metadata": {}, |
| 359 | + "outputs": [], |
| 360 | + "source": [ |
| 361 | + "sns.displot(roothog_df, x=\"CompletenessScore\", kind=\"ecdf\")\n", |
| 362 | + "plt.title(\"Cumulative distribution of CompletenessScore for all RootHOGs\", fontsize=16);\n" |
327 | 363 | ] |
328 | 364 | }, |
329 | 365 | { |
|
333 | 369 | "metadata": {}, |
334 | 370 | "outputs": [], |
335 | 371 | "source": [ |
336 | | - "sns.jointplot(data=hog_df[hog_df['is_roothog']==True], x='nr_members', y='CompletenessScore')" |
| 372 | + "g = sns.jointplot(data=roothog_df, \n", |
| 373 | + " kind=\"scatter\",\n", |
| 374 | + " x='nr_members', \n", |
| 375 | + " y='CompletenessScore', \n", |
| 376 | + " marginal_kws=dict(bins=20, element=\"step\"), \n", |
| 377 | + " marginal_ticks=True,\n", |
| 378 | + " height=11)\n", |
| 379 | + "g.fig.suptitle(\"Distribution of HOG sizes and CompletenessScores for all RootHOGs\", fontsize=16)\n", |
| 380 | + "g.ax_joint.set_xlabel(\"HOG size (number of member genes)\", fontsize=14) \n", |
| 381 | + "g.ax_joint.set_ylabel(\"Completeness Score of HOG\", fontsize=14)\n", |
| 382 | + "g.fig.tight_layout()\n", |
| 383 | + "g.fig.subplots_adjust(top=0.95)" |
| 384 | + ] |
| 385 | + }, |
| 386 | + { |
| 387 | + "cell_type": "markdown", |
| 388 | + "id": "9954b3f6-f435-496a-8fab-7da09baffc47", |
| 389 | + "metadata": {}, |
| 390 | + "source": [ |
| 391 | + "Alternatively, we can also analyse the HOGs at a given taxonomic level. Here, we generate the plot for a relatively deep level in the species tree. You can change this to more useful levels tfor your dataset.\n", |
| 392 | + "\n", |
| 393 | + "## HOGs at a taxonomic level" |
| 394 | + ] |
| 395 | + }, |
| 396 | + { |
| 397 | + "cell_type": "code", |
| 398 | + "execution_count": null, |
| 399 | + "id": "0e6476e7-694f-4108-8a02-3e86374416ad", |
| 400 | + "metadata": {}, |
| 401 | + "outputs": [], |
| 402 | + "source": [ |
| 403 | + "desired_subtree_size = max(2, int(0.4*len(species_tree.get_leaf_names())))\n", |
| 404 | + "node = species_tree\n", |
| 405 | + "while True:\n", |
| 406 | + " nr_child = [len(c.get_leaves()) for c in node.get_descendants()]\n", |
| 407 | + " if max(nr_child) < desired_subtree_size:\n", |
| 408 | + " break\n", |
| 409 | + " k = nr_child.index(max(nr_child))\n", |
| 410 | + " node = node.get_descendants()[k]\n", |
| 411 | + "level = node.name\n", |
| 412 | + "print(f\"We've selected {level} as our level of interest\\nIt contains {len(node.get_leaf_names())} species (out of {len(species_tree.get_leaves())}).\\nYou can use a different level by setting the level variable in the next cell instead.\")" |
| 413 | + ] |
| 414 | + }, |
| 415 | + { |
| 416 | + "cell_type": "code", |
| 417 | + "execution_count": null, |
| 418 | + "id": "f164e5f4-85f9-40ec-9dc7-1f7ff963ca25", |
| 419 | + "metadata": {}, |
| 420 | + "outputs": [], |
| 421 | + "source": [ |
| 422 | + "#level = \"XXX\"" |
| 423 | + ] |
| 424 | + }, |
| 425 | + { |
| 426 | + "cell_type": "code", |
| 427 | + "execution_count": null, |
| 428 | + "id": "7066668b-f260-4b7d-a9a0-68141f1149ef", |
| 429 | + "metadata": {}, |
| 430 | + "outputs": [], |
| 431 | + "source": [ |
| 432 | + "level_df = hog_df[(hog_df['level']==level)]\n", |
| 433 | + "print(f\"Number of HOGs at {level}: {len(level_df)}\")\n", |
| 434 | + "level_df.describe()" |
| 435 | + ] |
| 436 | + }, |
| 437 | + { |
| 438 | + "cell_type": "code", |
| 439 | + "execution_count": null, |
| 440 | + "id": "ee844afe-fa0a-4b3b-8fe3-4e9fec48bf36", |
| 441 | + "metadata": {}, |
| 442 | + "outputs": [], |
| 443 | + "source": [ |
| 444 | + "sns.displot(level_df, x=\"CompletenessScore\", kind=\"ecdf\")\n", |
| 445 | + "plt.title(f\"Cumulative distribution on CompletenessScore for HOGs at {level}\");\n" |
| 446 | + ] |
| 447 | + }, |
| 448 | + { |
| 449 | + "cell_type": "code", |
| 450 | + "execution_count": null, |
| 451 | + "id": "3f572007-d324-4310-bc55-b210e19034f8", |
| 452 | + "metadata": {}, |
| 453 | + "outputs": [], |
| 454 | + "source": [ |
| 455 | + "g = sns.jointplot(data=level_df, \n", |
| 456 | + " kind=\"scatter\",\n", |
| 457 | + " x='nr_members', \n", |
| 458 | + " y='CompletenessScore', \n", |
| 459 | + " marginal_kws=dict(bins=20, element=\"step\"), \n", |
| 460 | + " marginal_ticks=True,\n", |
| 461 | + " height=11)\n", |
| 462 | + "g.fig.suptitle(f\"Distribution of HOG sizes and CompletenessScores for HOGs at {level}\", fontsize=16)\n", |
| 463 | + "g.ax_joint.set_xlabel(\"HOG size (number of member genes)\", fontsize=14) \n", |
| 464 | + "g.ax_joint.set_ylabel(\"Completeness Score of HOG\", fontsize=14)\n", |
| 465 | + "g.fig.tight_layout()\n", |
| 466 | + "g.fig.subplots_adjust(top=0.95) " |
337 | 467 | ] |
338 | 468 | }, |
339 | 469 | { |
|
0 commit comments