
Commit 657b9e8

Merge pull request #1 from DocMinus/updates

Updates

2 parents: c7cebc1 + 7d27477

9 files changed: 170 additions & 121 deletions


.gitignore

Lines changed: 26 additions & 1 deletion

````diff
@@ -1,11 +1,36 @@
 # IDE
 **/.idea/
 **/.vscode/
-**/__pycache__
 **/.ipynb_checkpoints
 
 # other
 /datasets/*_TD.*
 /datasets/*.pkl
 /datasets/*.gz
+.pytest_cache/
 
+# Python build artifacts
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
````
Lines changed: 7 additions & 6 deletions

````diff
@@ -3,6 +3,7 @@
 """
 V2.1.4 (Mar. 08, 08:00:00 2023)
 Update: 2023-06-24 (cleanup for ChemRxiv submission)
+Update: 2024-02-22 (minor cleanup and file renaming)
 
 @author: Alexander Minidis (DocMinus)
 Purpose: TDs from csv
@@ -59,20 +60,20 @@ def main():
     # Calculate TDs
     transforms_descriptors = transform_descriptors(cmpd1_smi, cmpd2_smi, prod_smi)
 
-    # for output create table with structures and combine with calculated TDs
+    # combination of the three structure lists to a df
     _df = pd.DataFrame(
         {"Compound 1": cmpd1_smi, "Compound 2": cmpd2_smi, "Product": prod_smi}
     )
-    # In addition: filter when empty structures
+    # filter when empty structures
     _df = _df[~((_df.iloc[:, :3] == "").any(axis=1))]
-    # The three tables are concatenated to one
+    # Final table combines the structure lists and the TDs
     final_table = pd.concat(
         [in_rct_df["ID"], _df, transforms_descriptors], axis=1, join="inner"
     )
-    # output (optional)
-    print(final_table.tail())
     #############################################################################
-    # Write pickle & csv file
+    # Output, multiple options
+    print(final_table.tail())
+    # Write binary and tsv
     print("\nWriting to file: ", final_output_pkl)
     final_table.to_pickle(final_output_pkl)
     print("\nWriting to file: ", final_output_tsv)
````
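The filter-then-inner-join step in this hunk can be sketched in isolation. A minimal, hedged Python sketch: the `transform_descriptors` output is replaced by a dummy numeric DataFrame (the real function lives in `td_tools.rxntools`), and the SMILES are made up.

```python
import pandas as pd

# Made-up structure lists; the middle reaction has an empty product
cmpd1_smi = ["CCO", "CCN", "CCCl"]
cmpd2_smi = ["CCBr", "CCI", "CCF"]
prod_smi = ["CCOCC", "", "CCOC"]

# Stand-in for the TD table returned by transform_descriptors()
transforms_descriptors = pd.DataFrame({"TD1": [0.1, 0.2, 0.3]})

# combination of the three structure lists to a df
_df = pd.DataFrame(
    {"Compound 1": cmpd1_smi, "Compound 2": cmpd2_smi, "Product": prod_smi}
)
# filter when any of the three structure columns is empty
_df = _df[~((_df.iloc[:, :3] == "").any(axis=1))]

# the inner join drops TD rows whose index was filtered out above
final_table = pd.concat([_df, transforms_descriptors], axis=1, join="inner")
print(final_table.shape)  # (2, 4): the empty-product row is gone
```

The inner join is what keeps structures and descriptors aligned: filtering leaves the original index labels intact, so `concat(..., join="inner")` only pairs rows that survived.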

README.md

Lines changed: 23 additions & 17 deletions

````diff
@@ -3,36 +3,42 @@
 
 # Reaction Transform descriptors
 Python code to calculate reaction transform descriptors as described in [CHEMRXIV](https://chemrxiv.org/engage/chemrxiv/article-details/649888d41dcbb92a5e8e3475), by [@DocMinus](https://github.com/docminus) and [@DrAlatriste](https://github.com/DrAlatriste). <br>
-Not a full fledged package, some scripting know-how necessary to use or incorporate in own code might be necessary.
 
 ## Installation
 See _environment_ folder.
+Updated the installation with a setup file to enable the tools to be part of one's Python environment. Testing has also been added.
 
-## Usage
-Run the provided script by providing a file with tab/semicolon separated data (also comma or space, though not recommended):<br>
-`python 2AB_reaction_TDs.py path/inputfilename`<br>
+## Example Usage
+Run the example script by providing a file with tab/semicolon separated data (also comma or space, though not recommended):
+```shell
+python AB2C_reaction_TDs_example.py inputfilename
+```
 <br>
-You can get help by calling the script using -h: `python 2AB_reaction_TDs.py -h` <br>
+You can get help by calling the script using -h: `python AB2C_reaction_TDs_example.py -h` <br>
 <br>
-This particular script uses fileformat<br>
-_ID reactant1 reactant2 product_<br>
+This particular script expects the input order of the file as<br>
+
+_ID reactant1 reactant2 product_ <br>
 <br>
-The script will provide a simple cleaning of the structures; "extreme" broken structures might not get fixed with the provided method.<br>
+Simple cleaning of structures is included; "extreme" broken structures might not get fixed with the provided method.
 <br>
-Two small test-sets are provided with made up reactions, one of them containing a "faulty" structure to demonstrate correct filtration in the end result. Alternatively, run the _test.py_ script (see below)<br>
+Two small test-sets are provided with made-up reactions, one of them containing a "faulty" structure to demonstrate correct filtration in the output result. <br>Execute via: `python AB2C_reaction_TDs_example.py ./datasets/testreactions.tsv`<br>
 
 ## Syntax
 If you only want to use the TD function, your script requires the following minimum lines with the smiles as string tuples (even if only a single reaction):
+```shell
+from td_tools.rxntools import transform_descriptors
+
+output_table = transform_descriptors(['smiles_reactant1'],['smiles_reactant2'],['product'])
 ```
-from td_tools.rxntools import transform_descriptors
-
-output_table = transform_descriptors(['smiles_reactant1'],['smiles_reactant2'],['product'])
+A cleaning function as well as a file reader function is included for larger datasets:
+```shell
+from td_tools.rxntools import clean_smiles_multi, read_rct2pd
 ```
-A cleaning function as well as a file reader function is included for larger datasets.<br>
-Provided scripts include examples on how to concatenate the structures versus the TDs.<br>
-<br>
-For quick testing and timing use `Python test.py`.<br>
-Not a pytest package, but it nevertheless does the trick for quick demonstrating/testing.<br>
+The provided script includes examples on how to concatenate the structures versus the TDs.<br>
+
+## Testing
+Python testing has been added instead of the previous test.py, see the README.md under /tests.<br>
 <br>
 
 ### Acknowledgments
````
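The _ID reactant1 reactant2 product_ layout described above is simple to mimic without the package's own reader (`read_rct2pd`). A stdlib-only sketch with invented data, showing how such a file maps onto the string lists that `transform_descriptors` expects:

```python
import csv
import io

# Invented two-reaction input in the tab-separated layout the README describes
raw = (
    "ID\treactant1\treactant2\tproduct\n"
    "rxn1\tCCO\tCCBr\tCCOCC\n"
    "rxn2\tCCN\tCCI\tCCNCC\n"
)

# A real file would be opened with open(path); io.StringIO stands in here
rows = list(csv.DictReader(io.StringIO(raw), delimiter="\t"))
reactant1 = [r["reactant1"] for r in rows]
reactant2 = [r["reactant2"] for r in rows]
product = [r["product"] for r in rows]

# these parallel string lists are the shape transform_descriptors() takes
print(reactant1, reactant2, product)
```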

environment/README.md

Lines changed: 25 additions & 15 deletions

````diff
@@ -2,23 +2,33 @@
 
 ## Requirements
 Python >= 3.9 is required to use the [modern style](https://peps.python.org/pep-0585/) of type annotations.<br>
-Recommended: 3.11 (due to increased performance over versions <=3.10)<br>
-Modules required are sort of standard for chemistry scripting, rdkit, pandas & numpy, the latter two are nowadays part of a standard conda install.
+Recommended: 3.11 (due to increased performance over earlier versions)<br>
+Modules required are sort of standard for chemistry scripting: rdkit, pandas & numpy; the latter two are nowadays part of a standard conda install.
 
 
-## Installation with Anaconda/Miniconda
-If you nevertheless want a separate environment:<br>
-Run the two commands from the root directory.
+## Installation
+1. Anaconda/Miniconda
+If you nevertheless want a separate environment:<br>
+Run the two commands from the root directory.
 
-```shell
-conda env create -f ./environment/conda.yaml
-conda activate rxn_tds
-```
+```shell
+conda env create -f ./environment/conda.yaml
+conda activate rxn_tds
+```
 
-## Installation with Pip
-If you already have an environment you want to add this into, then:<br>
-Run the command from the root directory
+1b. (alternatively) Venv
+Note that venv would also work if you prefer that.
 
-```shell
-python -m pip install -r ./environment/requirements.txt
-```
+2. Pip
+Now run the requirements with pip into this new environment or into any that you already have.<br>
+Run the command from the root directory
+
+```shell
+pip install -r ./environment/requirements.txt
+pip install .
+```
+
+The latter installs the rxn_tools into the environment. The example script would work without that, but testing requires that.
+
+## Running Tests
+`pytest` is available for testing. See the README.md in /tests.
````
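Step 1b above mentions venv without spelling out the commands. A minimal sketch, assuming a POSIX shell, a repo-root working directory, and an illustrative environment name `.venv`:

```shell
# Create and activate a fresh virtual environment (name is illustrative)
python -m venv .venv
source .venv/bin/activate

# Then proceed with step 2 exactly as in the updated README
pip install -r ./environment/requirements.txt
pip install .
```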

setup.py

Lines changed: 15 additions & 0 deletions

````diff
@@ -0,0 +1,15 @@
+from setuptools import find_packages, setup
+
+setup(
+    name="td_tools",
+    version="2.1.3",
+    python_requires=">=3.9",
+    packages=find_packages(),
+    package_data={
+        "td_tools": ["*.txt"],
+    },
+    description="Reaction Transform Descriptor Tools",
+    author="DocMinus",
+    author_email="alexander.minidis@gmail.com",
+    url="https://github.com/DocMinus/RxnTransformDescriptors",
+)
````

Note: the setuptools keyword is `python_requires` (corrected above from `pythonrequires`, which setuptools would not recognize as a version constraint).

test.py

Lines changed: 0 additions & 82 deletions
This file was deleted.

tests/README.md

Lines changed: 13 additions & 0 deletions

````diff
@@ -0,0 +1,13 @@
+## Running Tests
+`pytest` is available for testing. Follow these steps:
+1. Ensure you have installed the project dependencies, as described in the Installation section.
+2. Install the testing dependencies:
+```bash
+pip install pytest
+```
+followed by
+```bash
+pytest
+```
+
+This command will discover and run all the test cases in the `tests/` directory.
````

tests/__init.py__

Whitespace-only changes.

tests/test_all.py

Lines changed: 61 additions & 0 deletions

````diff
@@ -0,0 +1,61 @@
+#!/usr/bin/env python
+# coding: utf-8
+""" test script, creating artificial data and testing the TDs calculation for reactions
+only tests the combination and final outcome, not the individual functions
+2024-02-22; DocMinus
+"""
+
+import pandas as pd
+import pytest
+
+from td_tools.rxntools import clean_smiles_multi, transform_descriptors
+
+
+def test_clean_smiles_multi_and_transform_descriptors():
+    dataset_size = 4  # number of compounds
+    # we define some faulty/missing compounds, so the output table should have 3 rows less than the input table
+    reactant1 = ["CCCN" for _ in range(dataset_size - 1)]
+    reactant1.append("cc")  # incorrect structure
+    reactant1.append("CCO")
+    reactant1.append("CCCl")
+    reactant1.append("CCCl")
+    total_dataset_size = len(reactant1)
+
+    reactant2 = ["CCCCO" for _ in range(dataset_size)]
+    reactant2.append("CC")
+    reactant2.append("CCO")
+    reactant2.append("CCBr")
+
+    product = ["ClCC1=C(B)C(P)=CC(Br)=C1O" for _ in range(dataset_size)]
+    product.append("cc")  # incorrect structure
+    product.append("")  # missing structure
+    product.append("CCI")
+
+    """ a total of 3 faulty rows; from the bottom of the created table these are the 2nd, 3rd and 4th last rows."""
+
+    g0 = clean_smiles_multi(reactant1)
+    g1 = clean_smiles_multi(reactant2)
+    g2 = clean_smiles_multi(product)
+
+    TD_numbers = transform_descriptors(g0, g1, g2)
+    print(f"{TD_numbers.shape = }, {TD_numbers.shape[1] = }")
+
+    final_table = pd.DataFrame({"Compound 1": g0, "Compound 2": g1, "Product": g2})
+    final_table = final_table[~((final_table.iloc[:, :3] == "").any(axis=1))]
+    final_table = pd.concat([final_table, TD_numbers], axis=1, join="inner")
+
+    # check if the final table is as expected (3 rows less than the input table)
+    assert (
+        final_table.shape[0] == total_dataset_size - 3
+    ), "Number of rows in the final table is not as expected."
+
+    # Check that the 2nd, 3rd, and 4th last rows have been removed
+    removed_indices = [
+        total_dataset_size - 2,
+        total_dataset_size - 3,
+        total_dataset_size - 4,
+    ]
+    for index in removed_indices:
+        assert (
+            index not in final_table.index
+        ), f"Row {index} should have been removed but is still in the final table."
````
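The `removed_indices` assertions in this test depend on pandas keeping the original index labels through boolean-mask filtering, so dropped labels are simply absent from `final_table.index`. A minimal sketch of that behavior (assuming, as the test does, that cleaning maps faulty or missing SMILES to empty strings):

```python
import pandas as pd

# Cleaned products; positions 2 and 3 ended up empty after "cleaning"
cleaned_products = ["CCOCC", "CCOCC", "", "", "CCI"]
table = pd.DataFrame({"Product": cleaned_products})

# Boolean-mask filtering keeps the original labels; dropped ones are absent
table = table[~(table["Product"] == "")]
print(list(table.index))  # [0, 1, 4] -- label 4 stays 4, not renumbered
```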
