diff --git a/CHANGELOG.md b/CHANGELOG.md index a2afb16..083758c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,13 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Added + +- `SmirkBigSmilesFast` Tokenizer for BigSMILES line notation representation of polymers ([#8](https://github.com/BattModels/smirk/pull/8)) + ### Changed - Bumped GitHub Actions, Python, Rust, and documentation dependencies ([#10](https://github.com/BattModels/smirk/pull/10) -- [#24](https://github.com/BattModels/smirk/pull/24)) ### Fixed -- Build issue due to leading `./` in included file paths ([#7](https://github.com/BattModels/smirk/pull/7)) +- Build issue due to leading `./` in included file paths ([#7](https://github.com/BattModels/smirk/pull/7)) - Fixed Dependabot configuration ([#9](https://github.com/BattModels/smirk/pull/9)) ## [v0.2.0](https://github.com/BattModels/smirk/tree/v0.2.0) diff --git a/docs/api.rst b/docs/api.rst index 5a8a09e..4b67cab 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -33,7 +33,13 @@ API Reference .. seealso:: :py:meth:`transformers.PreTrainedTokenizerBase.batch_decode` for the 🤗 documentation -.. autofunction:: smirk.SmirkSelfiesFast +.. autoclass:: smirk.SmirkSelfiesFast + :special-members: + :members: + +.. autoclass:: smirk.SmirkBigSmilesFast + :special-members: + :members: .. 
autodata:: smirk.SPECIAL_TOKENS diff --git a/docs/big_smirk_demo.ipynb b/docs/big_smirk_demo.ipynb new file mode 100644 index 0000000..70512ce --- /dev/null +++ b/docs/big_smirk_demo.ipynb @@ -0,0 +1,471 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0", + "metadata": {}, + "source": [ + "# BigSMILES and BigSmirk\n", + "\n", + "\n", + " \"Open\n", + "\n", + "\n", + " \"Binder\"\n", + "\n", + "\n", + "\n", + "\n", + "BigSmirk tokenizes the [BigSMILES] encoding for macromolecules all the way down to their constituent elements.\n", + "\n", + "Let's see it in action!\n", + "\n", + "[BigSMILES]: https://olsenlabmit.github.io/BigSMILES/docs/#the-bigsmiles-project" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "1", + "metadata": {}, + "source": [ + "🐍 Installation is easy with pre-build binaries on [PyPI](https://pypi.org/project/smirk/) and [GitHub](https://github.com/BattModels/smirk/releases). Just run: `pip install smirk`\n", + "\n", + "> Installing from source? See [installing from source](./developer.md#installing-from-source) for instructions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2", + "metadata": { + "tags": [ + "hide-output" + ] + }, + "outputs": [], + "source": [ + "!python -m pip install smirk transformers" + ] + }, + { + "cell_type": "markdown", + "id": "3", + "metadata": {}, + "source": [ + "## First steps\n", + "\n", + "🤗 smirk subclasses Hugging Face's [PreTrainedTokenizerBase](#transformers.PreTrainedTokenizerBase) for seamless compatibility and leverages [Tokenizers] for raw rust-powered speed. 
No need to learn another framework; everything works out of the box 🎁\n", + "\n", + "[Tokenizers]: https://huggingface.co/docs/tokenizers/index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4", + "metadata": {}, + "outputs": [], + "source": [ + "from smirk import SmirkBigSmilesFast\n", + "\n", + "# Just import and tokenize!\n", + "bigsmirk = SmirkBigSmilesFast()\n", + "bigsmirk(\"{[][$]CC[$],[$]CC(CC)[$][]}\") # ethylene butene copolymer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5", + "metadata": {}, + "outputs": [], + "source": [ + "# Batch Tokenization with Padding\n", + "batch = bigsmirk([\n", + " \"[H]O{[>][<]C(=O)CCCCC(=O)[<],[>]NCCCCCCN[>][<]}[H]\", # nylon-6,6\n", + " \"{[][<]OCC[>][<]}{[>][<]OC(C)C[>][]}\", # block copolymer\n", + " \"{[][<]C(=O)c1ccc(cc1)C(=O)[<],[>]OCCO[>][]}\", # alternation co-polymer\n", + "], padding=\"longest\")\n", + "batch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6", + "metadata": {}, + "outputs": [], + "source": [ + "# Back to polymers!\n", + "bigsmirk.batch_decode(batch[\"input_ids\"], skip_special_tokens=True)" + ] + }, + { + "cell_type": "markdown", + "id": "7", + "metadata": {}, + "source": [ + "Let's visualize BigSMILES token boundaries for PVC (Polyvinyl chloride ) and sPP (Syndiotactic Polypropylene) by coloring each token in sequence." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8", + "metadata": {}, + "outputs": [], + "source": [ + "import hashlib\n", + "from html import escape\n", + "from IPython.display import HTML\n", + "\n", + "\n", + "def render_colored_tokens(text, tokenizer=bigsmirk):\n", + " tokens = tokenizer.tokenize(text)\n", + " palette = [\n", + " \"#f94144\", \"#f3722c\", \"#f8961e\", \"#f9844a\", \"#f9c74f\",\n", + " \"#90be6d\", \"#43aa8b\", \"#4d908e\", \"#577590\", \"#277da1\",\n", + " ]\n", + " spans = []\n", + " for tok in tokens:\n", + " digest = hashlib.sha1(tok.encode(\"utf-8\")).digest()\n", + " color = palette[int.from_bytes(digest[:2], \"big\") % len(palette)]\n", + " label = escape(tok)\n", + " spans.append(\n", + " f\"{label}\"\n", + " )\n", + " return HTML(\"
\" + \"\".join(spans) + \"
\")\n", + "\n", + "\n", + "pvc = \"{[][$]CC(Cl)[$][]}\"\n", + "render_colored_tokens(pvc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9", + "metadata": {}, + "outputs": [], + "source": [ + "spp = \"CC{[>][<]C[C@@H](C)C[C@H](C)[>];[<]C=CC,[<]C[C@H](C)C=CC[]}\"\n", + "render_colored_tokens(spp)" + ] + }, + { + "cell_type": "markdown", + "id": "10", + "metadata": {}, + "source": [ + "### Handling the Fragment Name Definition Notation\n", + "\n", + "The BigSMILES line notation allows some portions of the BigSMILES representations be replaced by more abstract but compact proxy [fragment names], for example, the names of repeating units.\n", + "The smirk BigSMILES tokenizers handles this `'[' + '#' + fragment_name + ']'` syntax by replacing the fragment name with its definition and then tokenizing the expanded BigSMILES.\n", + "The definition should be provided within the BigSMILES as specified by the line notation i.e.:\n", + "`BigSMILES_string + '[' + '#' + fragment_name + ']' + BigSMILES_string + '.' 
+ '{' + '#' + fragment_name + '=' + BigSMILES_string + '}'`\n", + "\n", + "For example:\n", + "```\n", + "C([#Arm1])([#Arm2]).{#Arm1=CO}.{#Arm2=N} --> C(CO)(N) ---> 'C', '(', 'C', 'O', ')', '(', 'N', ')'\n", + "```\n", + "More examples of valid and invalid use of the fragment name definition notation are provided below.\n", + "\n", + "[fragment names]: https://olsenlabmit.github.io/BigSMILES/docs/line_notation.html#simplifications-and-abbreviations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11", + "metadata": {}, + "outputs": [], + "source": [ + "abstract_label_examples = {\n", + " \"defined [#label] placeholder\": \"C([#Arm])([#Arm])([#Arm])[#Arm].{#Arm=CO{[<][>]CCO[<][>]}}\",\n", + " \"multiple [#label] placeholders\": \"C([#Arm1])([#Arm2]).{#Arm1=CO}.{#Arm2=N}\",\n", + " \"[#label] in stochastic object\": \"{[][$]CC(C)([#Side])[$][]}.{#Side=C(=O)OCC}\",\n", + " \"undefined [#label] placeholder\": \"C([#Arm])([#Arm])\",\n", + " \"bare labels stay unknown\": r\"A([$1[<1]1])R(A'[$1[>1]1]).{#A=C}.{#R=C}\",\n", + "}\n", + "\n", + "for label, text in abstract_label_examples.items():\n", + " encoded = bigsmirk(text, add_special_tokens=False)\n", + " tokens = bigsmirk.convert_ids_to_tokens(encoded[\"input_ids\"])\n", + " decoded = bigsmirk.decode(encoded[\"input_ids\"], skip_special_tokens=True)\n", + "\n", + " print(label)\n", + " print(\"input:\", text)\n", + " print(\"tokens:\", tokens)\n", + " print(\"unknowns:\", tokens.count(bigsmirk.unk_token))\n", + " print(\"decoded:\", decoded)\n", + " print()\n" + ] + }, + { + "cell_type": "markdown", + "id": "12", + "metadata": {}, + "source": [ + "## Zero to Polymer Foundation Model with Smirk!" + ] + }, + { + "cell_type": "markdown", + "id": "13", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "Let's train a small [RoBERTa] model on polymers from [S. 
Choi et al., 2024] using Hugging Face and smirk.\n", + "\n", + "[RoBERTa]: https://doi.org/10.48550/ARXIV.1907.11692\n", + "[S. Choi et al., 2024]:https://www.nature.com/articles/s41597-024-03212-4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "hide-output", + "remove-stderr" + ] + }, + "outputs": [], + "source": [ + "!python -m pip install accelerate datasets torch" + ] + }, + { + "cell_type": "markdown", + "id": "15", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Dataset Preprocessing\n", + "\n", + "Download the dataset for generated by [Choi et al] from [figshare] and unzip it:\n", + "\n", + "```\n", + "curl -L -o with_Tg.zip \"https://springernature.figshare.com/ndownloader/files/42507037\"\n", + "unzip with_Tg.zip -d with_Tg\n", + "```\n", + "\n", + "[Choi et al]: https://www.nature.com/articles/s41597-024-03212-4#citeas\n", + "[figshare]: \"https://springernature.figshare.com/ndownloader/files/42507037\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "remove-stderr" + ] + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "import os\n", + "\n", + "# Location to unzipped data\n", + "data_dir = \"with_Tg\"\n", + "dataset=load_dataset(\"csv\", data_files=[os.path.join(data_dir,\"JCIM_sup_bigsmiles.csv\")])[\"train\"].select_columns([\"BigSMILES\"]).train_test_split(test_size=0.2)\n", + "dataset=dataset.map(bigsmirk, input_columns=[\"BigSMILES\"], desc=\"Tokenizing\")" + ] + }, + { + "cell_type": "markdown", + "id": "17", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "> 💡 Hugging Face/ Tokenizers may raise a warning about being forked as we've already used our 
tokenizers (this isn't a smirk issue).\n", + "> It's harmless, but when actually training it's best to avoid tokenization until after the fork to benefit from the rust-level parallelism\n", + "\n", + "🎉 That's it! We've tokenized all of the BigSMILES dataset using smirk!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "dataset[\"train\"].to_pandas().head()" + ] + }, + { + "cell_type": "markdown", + "id": "19", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "### Training\n", + "Once we've tokenized the dataset, training the model is just a matter of configuration." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "remove-stderr" + ] + }, + "outputs": [], + "source": [ + "from transformers import Trainer, RobertaForMaskedLM, RobertaConfig, DataCollatorForLanguageModeling\n", + "\n", + "# A very small model for demonstrating training a molecular foundation model with smirk \n", + "config = RobertaConfig(\n", + " vocab_size=len(bigsmirk),\n", + " hidden_size=256,\n", + " intermediate_size=1024,\n", + " num_hidden_layers=4,\n", + " num_attention_heads=4,\n", + ")\n", + "model = RobertaForMaskedLM(config)\n", + "\n", + "# Setup up the trainer to use our dataset\n", + "trainer = Trainer(\n", + " model=model,\n", + " train_dataset=dataset[\"train\"],\n", + " eval_dataset=dataset[\"test\"],\n", + " processing_class=bigsmirk,\n", + " data_collator=DataCollatorForLanguageModeling(bigsmirk), # The data collator needs to know about our tokenizer\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21", + "metadata": { + "editable": true, + "slideshow": { + "slide_type": "" + }, + "tags": [ + "skip-execution" + ] + 
}, + "outputs": [], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "id": "22", + "metadata": {}, + "source": [ + "### MLM Example: Predict a Masked Token\n", + "\n", + "Mask one token in a BigSMILES string and ask the trained model for top predictions." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "inference_model = trainer.model\n", + "inference_model.eval()\n", + "device = next(inference_model.parameters()).device\n", + "\n", + "sample = dataset[\"test\"][5][\"BigSMILES\"]\n", + "\n", + "# Encode and choose a position to mask\n", + "encoded = bigsmirk(sample, add_special_tokens=False)\n", + "input_ids = encoded[\"input_ids\"]\n", + "tokens = bigsmirk.convert_ids_to_tokens(input_ids)\n", + "mask_pos = len(tokens) // 2\n", + "\n", + "masked_ids = input_ids.copy()\n", + "masked_ids[mask_pos] = bigsmirk.mask_token_id\n", + "masked_tokens = bigsmirk.convert_ids_to_tokens(masked_ids)\n", + "\n", + "inputs = {\n", + " \"input_ids\": torch.tensor([masked_ids],device=device),\n", + " \"attention_mask\": torch.ones((1, len(masked_ids)), device=device),\n", + "}\n", + "\n", + "with torch.no_grad():\n", + " logits = inference_model(**inputs).logits[0, mask_pos].detach().cpu()\n", + " probs = torch.softmax(logits, dim=-1)\n", + "\n", + "top_k = 5\n", + "top_ids = torch.topk(probs, k=top_k).indices.tolist()\n", + "top_tokens = bigsmirk.convert_ids_to_tokens(top_ids)\n", + "\n", + "print(\"Original:\", sample)\n", + "print(\"Masked :\", \"\".join(masked_tokens))\n", + "print(f\"Masked token index: {mask_pos} (original token: {tokens[mask_pos]})\")\n", + "print(\"\\nTop predictions:\")\n", + "\n", + "for rank, (tok_id, tok) in enumerate(zip(top_ids, top_tokens), start=1):\n", + " candidate_ids = masked_ids.copy()\n", + " candidate_ids[mask_pos] = tok_id\n", + " candidate = bigsmirk.decode(candidate_ids, skip_special_tokens=True)\n", + " 
def merge_tokens_grouped(tokens):
    """Collapse one- and two-character tokens into grouped regex alternatives.

    Tokens are bucketed by their first character. Each bucket becomes a
    single pattern: the leading character followed by its second characters,
    wrapped in a non-capturing group ``(?:..|..)`` when there is more than
    one. The tail is made optional (``?``) when the leading character is
    also a token on its own.

    :param tokens: iterable of tokens, each of length 1 or 2
    :returns: the per-leader patterns, sorted lexicographically
    :raises AssertionError: if a token is not 1 or 2 characters long
    """
    suffixes_by_leader = defaultdict(set)
    for token in tokens:
        assert len(token) in [1, 2]
        # ``None`` records that the leader is a complete token by itself.
        suffix = token[1] if len(token) == 2 else None
        suffixes_by_leader[token[0]].add(suffix)

    patterns = []
    for leader, suffixes in suffixes_by_leader.items():
        standalone = None in suffixes
        choices = sorted(s for s in suffixes if s is not None)

        if not choices:
            # Only the bare leader was seen; nothing to make optional.
            patterns.append(leader)
            continue

        if len(choices) == 1:
            tail = choices[0]
        else:
            tail = f"(?:{'|'.join(choices)})"

        patterns.append(f"{leader}{tail}?" if standalone else f"{leader}{tail}")

    return sorted(patterns)
def build_bigsmiles_vocab():
    """Build the BigSMILES vocab as the SMILES vocab plus BigSMILES extras.

    The SMILES vocabulary is built first, so every SMILES token keeps the id
    it has in the plain SMILES vocab. The tokens in
    ``BIGSMILES_EXTRA_TOKENS`` are then appended in list order with the next
    free ids.

    :returns: dict mapping token -> integer id
    """
    vocab = build_vocab(build_smiles_alphabet())
    for token in BIGSMILES_EXTRA_TOKENS:
        # Only append tokens that are genuinely new. Re-assigning an existing
        # token would leave its old id unused and, because the dict does not
        # grow, hand the same id to the next new token as well.
        if token not in vocab:
            vocab[token] = len(vocab)
    return vocab
class SmirkBigSmilesFast(SmirkTokenizerFast):
    def __init__(self, tokenizer_file: Optional[os.PathLike] = None, **kwargs):
        """
        A Chemically-Complete Tokenizer for core BigSMILES line notation.
        For the reference specification see:
        https://olsenlabmit.github.io/BigSMILES/docs/line_notation.html.

        .. warning::
            SmirkBigSmilesFast supports explicit BigSMILES fragment definitions
            such as ``[#R].{#R=...}``, but it does not load or expand the
            predefined Common Repeat Unit table from the BigSMILES v1.1
            documentation. Common repeat unit placeholders must therefore be
            defined explicitly.

        The backing tokenizer is taken from the first of these that is set:

        1. a ``tokenizer`` keyword argument, used as-is;
        2. ``tokenizer_file``, loaded with ``SmirkTokenizer.from_file``;
        3. a ``vocab_file`` keyword argument, defaulting to the packaged
           ``vocab_bigsmiles.json``, loaded with ``SmirkTokenizer.from_vocab``;
        4. otherwise (``vocab_file`` passed explicitly as a falsy value) a
           ``SmirkTokenizer(bigsmiles=True)`` built without a vocab file.

        :param tokenizer_file: Path to a JSON-serialized
            :py:class:`SmirkTokenizerFast` tokenizer
        :param kwargs: Additional kwargs are passed to :py:class:`SmirkTokenizerFast`
        """
        default_vocab_file = files("smirk").joinpath("vocab_bigsmiles.json")

        # A caller-supplied tokenizer wins; it is forwarded untouched.
        tokenizer = kwargs.pop("tokenizer", None)
        if not tokenizer:
            if tokenizer_file:
                tokenizer = rs_smirk.SmirkTokenizer.from_file(str(tokenizer_file))
                kwargs["tokenizer_file"] = str(tokenizer_file)
            elif vocab_file := kwargs.pop("vocab_file", default_vocab_file):
                tokenizer = rs_smirk.SmirkTokenizer.from_vocab(
                    str(vocab_file), bigsmiles=True
                )
                kwargs["vocab_file"] = str(vocab_file)
            else:
                tokenizer = rs_smirk.SmirkTokenizer(bigsmiles=True)

        super().__init__(tokenizer=tokenizer, **kwargs)
"In": 78, + "Ir": 79, + "K": 80, + "Kr": 81, + "La": 82, + "Li": 83, + "Lr": 84, + "Lu": 85, + "Lv": 86, + "Mc": 87, + "Md": 88, + "Mg": 89, + "Mn": 90, + "Mo": 91, + "Mt": 92, + "N": 93, + "Na": 94, + "Nb": 95, + "Nd": 96, + "Ne": 97, + "Nh": 98, + "Ni": 99, + "No": 100, + "Np": 101, + "O": 102, + "Og": 103, + "Os": 104, + "P": 105, + "Pa": 106, + "Pb": 107, + "Pd": 108, + "Pm": 109, + "Po": 110, + "Pr": 111, + "Pt": 112, + "Pu": 113, + "Ra": 114, + "Rb": 115, + "Re": 116, + "Rf": 117, + "Rg": 118, + "Rh": 119, + "Rn": 120, + "Ru": 121, + "S": 122, + "Sb": 123, + "Sc": 124, + "Se": 125, + "Sg": 126, + "Si": 127, + "Sm": 128, + "Sn": 129, + "Sr": 130, + "Ta": 131, + "Tb": 132, + "Tc": 133, + "Te": 134, + "Th": 135, + "Ti": 136, + "Tl": 137, + "Tm": 138, + "Ts": 139, + "U": 140, + "V": 141, + "W": 142, + "Xe": 143, + "Y": 144, + "Yb": 145, + "Zn": 146, + "Zr": 147, + "[": 148, + "\\": 149, + "]": 150, + "as": 151, + "b": 152, + "c": 153, + "n": 154, + "o": 155, + "p": 156, + "s": 157, + "se": 158, + "{": 159, + "}": 160, + ",": 161, + ";": 162, + "<": 163, + ">": 164 +} diff --git a/src/pre_tokenizers/bigsmirk.rs b/src/pre_tokenizers/bigsmirk.rs new file mode 100644 index 0000000..5887f10 --- /dev/null +++ b/src/pre_tokenizers/bigsmirk.rs @@ -0,0 +1,860 @@ +use super::split_bigsmiles::{MATCH_INNER_BIGSMILES, MATCH_OUTER_BIGSMILES}; +use once_cell::sync::Lazy; +use regex::{Match, Regex}; +use serde::de::Visitor; +use serde::ser::SerializeStruct; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::fmt; +use tokenizers::tokenizer::pattern::Pattern; +use tokenizers::tokenizer::{ + Offsets, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior, +}; + +#[derive(Clone)] +pub struct BigSmirkPreTokenizer { + outer: Regex, + inner: Regex, + inner_partial: Regex, +} + +impl BigSmirkPreTokenizer { + pub const BIGSMILES_VERSION: &'static str = "1.1"; + + pub fn new(outer: &str, inner: &str) -> Self { + Self { + outer: 
Regex::new(&outer).unwrap(), + inner: Regex::new(&inner).unwrap(), + inner_partial: Regex::new(partial_inner_pattern(inner)).unwrap(), + } + } + + pub fn split(&self, text: &String) -> Vec { + self.find_matches(text) + .unwrap() + .into_iter() + .map(|(offset, _)| text.get(offset.0..offset.1).unwrap().to_owned()) + .filter(|tok| !tok.is_empty()) + .collect() + } +} + +impl Default for BigSmirkPreTokenizer { + fn default() -> Self { + BigSmirkPreTokenizer::new(MATCH_OUTER_BIGSMILES, MATCH_INNER_BIGSMILES) + } +} + +impl PartialEq for BigSmirkPreTokenizer { + fn eq(&self, other: &Self) -> bool { + self.outer.as_str() == other.outer.as_str() && self.inner.as_str() == other.inner.as_str() + } +} + +impl fmt::Debug for BigSmirkPreTokenizer { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("BigSmirkPreTokenizer") + .field("outer", &format_args!("'{}'", &self.outer.as_str())) + .field("inner", &format_args!("'{}'", &self.inner.as_str())) + .finish() + } +} + +impl Serialize for BigSmirkPreTokenizer { + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + let mut state = serializer.serialize_struct("BigSmirkPreTokenizer", 4)?; + state.serialize_field("type", "BigSmirkPreTokenizer")?; + state.serialize_field("bigsmiles_version", Self::BIGSMILES_VERSION)?; + state.serialize_field("outer", self.outer.as_str())?; + state.serialize_field("inner", self.inner.as_str())?; + state.end() + } +} + +impl<'de> Deserialize<'de> for BigSmirkPreTokenizer { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_struct( + "BigSmirkPreTokenizer", + &["type", "bigsmiles_version", "outer", "inner"], + BigSmirkPreTokenizerVisitor, + ) + } +} + +struct BigSmirkPreTokenizerVisitor; +impl<'de> Visitor<'de> for BigSmirkPreTokenizerVisitor { + type Value = BigSmirkPreTokenizer; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result 
{ + write!(formatter, "struct BigSmirkPreTokenizer with type field") + } + + fn visit_map(self, mut map: A) -> std::result::Result + where + A: serde::de::MapAccess<'de>, + { + let mut outer: Option = None; + let mut inner: Option = None; + let mut type_field: Option = None; + let mut bigsmiles_version: Option = None; + while let Some(key) = map.next_key::()? { + match key.as_ref() { + "type" => { + type_field = Some(map.next_value()?); + } + "bigsmiles_version" => { + bigsmiles_version = Some(map.next_value()?); + } + "outer" => { + if let Some(x) = map.next_value()? { + outer = Some(x); + } + } + "inner" => { + if let Some(x) = map.next_value()? { + inner = Some(x); + } + } + _ => { + let _: serde::de::IgnoredAny = map.next_value()?; + } + } + } + match type_field.as_deref() { + Some("BigSmirkPreTokenizer") => {} + _ => { + return Err(serde::de::Error::custom( + "Missing or invalid type field for BigSmirkPreTokenizer", + )); + } + } + match bigsmiles_version.as_deref() { + Some(BigSmirkPreTokenizer::BIGSMILES_VERSION) => {} + Some(version) => { + return Err(serde::de::Error::invalid_value( + serde::de::Unexpected::Str(version), + &"BigSMILES version `1.1`", + )); + } + None => { + return Err(serde::de::Error::missing_field("bigsmiles_version")); + } + } + Ok(BigSmirkPreTokenizer::new( + outer.expect("Missing `outer`").as_str(), + inner.expect("Missing `inner`").as_str(), + )) + } +} + +impl PreTokenizer for BigSmirkPreTokenizer { + fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { + pretokenized.normalize(|normalized| { + if let Some(expanded) = expand_fragment_definitions(normalized.get()) { + static MATCH_FULL_STRING: Lazy = + Lazy::new(|| Regex::new(r"(?s)^.*$").unwrap()); + normalized.replace(&*MATCH_FULL_STRING, &expanded)?; + } + Ok(()) + })?; + pretokenized.split(|_, s| s.split(self.to_owned(), SplitDelimiterBehavior::Isolated)) + } +} + +fn expand_fragment_definitions(input: &str) -> Option { + let (main, definitions) = 
split_fragment_definitions(input)?; + let mut expanded = main.to_string(); + + for _ in 0..=definitions.len() { + let (next, changed) = expand_fragment_references_once(&expanded, &definitions); + expanded = next; + if !changed { + break; + } + } + + Some(expanded) +} + +fn split_fragment_definitions(input: &str) -> Option<(&str, HashMap)> { + for (start, _) in input.match_indices(".{#") { + if let Some(definitions) = parse_fragment_definition_suffix(&input[start..]) { + return Some((&input[..start], definitions)); + } + } + None +} + +fn parse_fragment_definition_suffix(suffix: &str) -> Option> { + let mut definitions = HashMap::new(); + let mut pos = 0; + + while pos < suffix.len() { + if !suffix[pos..].starts_with(".{#") { + return None; + } + pos += ".{#".len(); + + let name_start = pos; + while pos < suffix.len() { + let c = suffix[pos..].chars().next().unwrap(); + if c == '=' { + break; + } + pos += c.len_utf8(); + } + let name = &suffix[name_start..pos]; + if name.is_empty() || !is_fragment_definition_name(name) { + return None; + } + if !suffix[pos..].starts_with('=') { + return None; + } + pos += '='.len_utf8(); + + let value_start = pos; + let mut depth = 1; + while pos < suffix.len() { + let c = suffix[pos..].chars().next().unwrap(); + match c { + '{' => depth += 1, + '}' => { + depth -= 1; + if depth == 0 { + definitions.insert(name.to_string(), suffix[value_start..pos].to_string()); + pos += c.len_utf8(); + break; + } + } + _ => {} + } + pos += c.len_utf8(); + } + + if depth != 0 { + return None; + } + } + + Some(definitions) +} + +fn is_fragment_definition_name(name: &str) -> bool { + name.chars() + .all(|c| matches!(c, '!'..='~') && !matches!(c, '=' | '{' | '}' | '[' | ']')) +} + +fn expand_fragment_references_once( + input: &str, + definitions: &HashMap, +) -> (String, bool) { + let mut expanded = String::with_capacity(input.len()); + let mut changed = false; + let mut pos = 0; + + while pos < input.len() { + let rest = &input[pos..]; + if let 
Some((name, len)) = bracketed_fragment_reference(rest) { + if let Some(replacement) = definitions.get(name) { + expanded.push_str(replacement); + pos += len; + changed = true; + continue; + } + } + + let c = rest.chars().next().unwrap(); + expanded.push(c); + pos += c.len_utf8(); + } + + (expanded, changed) +} + +fn bracketed_fragment_reference(input: &str) -> Option<(&str, usize)> { + if !input.starts_with("[#") { + return None; + } + + let end = input.find(']')?; + let name = &input[2..end]; + if name.is_empty() { + return None; + } + Some((name, end + ']'.len_utf8())) +} + +fn append_split(splits: &mut Vec<(Offsets, bool)>, prev: &mut usize, m: Match, offset: usize) { + let start = m.start() + offset; + let end = m.end() + offset; + if *prev != start { + splits.push(((*prev, start), false)); + } + splits.push(((start, end), true)); + *prev = end; +} + +fn partial_inner_pattern(inner: &str) -> &str { + let pattern = inner + .strip_prefix("^(?:") + .and_then(|pattern| pattern.strip_suffix(")$")) + .unwrap_or(inner); + pattern.strip_prefix('|').unwrap_or(pattern) +} + +impl Pattern for BigSmirkPreTokenizer { + fn find_matches(&self, inside: &str) -> Result> { + let mut splits = Vec::with_capacity(inside.len()); + let mut prev = 0; + static IS_NUMBER: Lazy = Lazy::new(|| Regex::new(r"^\d+$").unwrap()); + static IS_BONDING_DESC: Lazy = Lazy::new(|| Regex::new(r"^[\$<>]$").unwrap()); + for m_outer in self.outer.find_iter(inside) { + // Check for Brackets + if m_outer.as_str().starts_with("[") { + // Record opening [ + splits.push(((m_outer.start(), m_outer.start() + 1), true)); + prev = m_outer.start() + 1; + + // Record contents between brackets + let bracketed = &inside[(m_outer.start() + 1)..(m_outer.end() - 1)]; + + // Try to match with inner pattern + if let Some(capture) = self + .inner + .captures(bracketed) + .or_else(|| self.inner_partial.captures(bracketed)) + { + // Unpack bracketed atoms + for i in 1..capture.len() { + if let Some(m) = capture.get(i) { + 
let matched_str = m.as_str(); + if matched_str.is_empty() { + continue; + } + if IS_NUMBER.is_match(matched_str) { + // Tokenize numbers as digits + for d in m.range() { + let s = d + m_outer.start() + 1; + splits.push(((s, s + 1), true)); + prev = s + 1; + } + } else if IS_BONDING_DESC.is_match(matched_str) { + // Bonding descriptor ($, <, >) - keep as single token + append_split(&mut splits, &mut prev, m, m_outer.start() + 1) + } else { + append_split(&mut splits, &mut prev, m, m_outer.start() + 1) + } + } + } + } + + // Check for trailing unmatched characters within the brackets + if prev != (m_outer.end() - 1) { + splits.push(((prev, m_outer.end() - 1), false)); + prev = m_outer.end() - 1; + } + + // Record closing ] + assert!(m_outer.as_str().ends_with("]")); + splits.push(((prev, m_outer.end()), true)); + prev = m_outer.end(); + } else { + append_split(&mut splits, &mut prev, m_outer, 0); + } + } + if prev != inside.len() { + splits.push(((prev, inside.len()), false)); + } + Ok(splits) + } +} + +#[cfg(test)] +pub mod tests { + use std::fs; + use std::path::PathBuf; + + use super::*; + use crate::test_utils::check_serde; + use tokenizers::tokenizer::{OffsetReferential, OffsetType}; + + #[test] + fn serialize_default() { + let default = BigSmirkPreTokenizer::default(); + check_serde(&default); + } + + #[test] + fn serializes_bigsmiles_version() { + let value = serde_json::to_value(BigSmirkPreTokenizer::default()).unwrap(); + assert_eq!( + value.get("bigsmiles_version").and_then(|v| v.as_str()), + Some(BigSmirkPreTokenizer::BIGSMILES_VERSION) + ); + } + + #[test] + fn rejects_missing_bigsmiles_version() { + let mut value = serde_json::to_value(BigSmirkPreTokenizer::default()).unwrap(); + value.as_object_mut().unwrap().remove("bigsmiles_version"); + + let err = serde_json::from_value::(value).unwrap_err(); + assert!(err + .to_string() + .contains("missing field `bigsmiles_version`")); + } + + #[test] + fn rejects_unsupported_bigsmiles_versions() { + for version 
in ["1.0", "1.2", "2.0", "not-a-version"] { + let mut value = serde_json::to_value(BigSmirkPreTokenizer::default()).unwrap(); + value["bigsmiles_version"] = serde_json::Value::String(version.to_string()); + + let err = serde_json::from_value::(value).unwrap_err(); + let message = err.to_string(); + assert!(message.contains(&format!("invalid value: string \"{}\"", version))); + assert!(message.contains("expected BigSMILES version `1.1`")); + } + } + + #[test] + fn serialize_pretok() { + let pretok = BigSmirkPreTokenizer::new(r".|\[.*?]", "."); + check_serde(&pretok); + } + + fn all_matches(tok: &BigSmirkPreTokenizer, bigsmiles: &str) -> bool { + let splits = tok.find_matches(bigsmiles).unwrap(); + splits.into_iter().all(|(_s, m)| m) + } + + fn all_matches_after_fragment_expansion(tok: &BigSmirkPreTokenizer, bigsmiles: &str) -> bool { + let expanded = expand_fragment_definitions(bigsmiles).unwrap_or_else(|| bigsmiles.into()); + all_matches(tok, &expanded) + } + + fn get_matched_pretokens(tok: &BigSmirkPreTokenizer, bigsmiles: &str) -> Vec { + tok.find_matches(bigsmiles) + .unwrap() + .into_iter() + .filter(|(_, m)| *m) + .map(|(o, _)| bigsmiles[o.0..o.1].into()) + .collect() + } + + fn get_split_tokens(tok: &BigSmirkPreTokenizer, bigsmiles: &str) -> Vec { + let mut bigsmiles = PreTokenizedString::from(bigsmiles); + tok.pre_tokenize(&mut bigsmiles).unwrap(); + bigsmiles + .get_splits(OffsetReferential::Original, OffsetType::Byte) + .into_iter() + .map(|(s, _, _)| s.to_string()) + .collect() + } + + #[test] + fn check_bigsmiles_splits() { + let pretok = BigSmirkPreTokenizer::default(); + let bigsmiles = "{[$]CC[$]}".to_string(); + let split = ["{", "[", "$", "]", "C", "C", "[", "$", "]", "}"]; + assert_eq!(get_split_tokens(&pretok, bigsmiles.as_str()), split); + assert_eq!(pretok.split(&bigsmiles), split); + } + + #[test] + fn check_unknown() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!(get_split_tokens(&pretok, "C🤷"), ["C", "🤷"]); + 
assert_eq!(get_split_tokens(&pretok, "🤷"), ["🤷"]); + assert_eq!(get_split_tokens(&pretok, "🤷C"), ["🤷", "C"]); + assert_eq!( + get_split_tokens(&pretok, "C[H🤷]"), + ["C", "[", "H", "🤷", "]"] + ); + assert_eq!(get_split_tokens(&pretok, "[🤷]"), ["[", "🤷", "]"]); + assert_eq!( + get_split_tokens(&pretok, "[🤷H]C"), + ["[", "🤷", "H", "]", "C"] + ); + } + + #[test] + fn test_standard_smiles_basic() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "OC[C@@H]"), + ["O", "C", "[", "C", "@@", "H", "]"] + ); + assert_eq!( + get_split_tokens(&pretok, "OC[C@@H][OH]"), + ["O", "C", "[", "C", "@@", "H", "]", "[", "O", "H", "]"] + ); + } + + #[test] + fn test_standard_smiles_chirality() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!(get_split_tokens(&pretok, "[C@]"), ["[", "C", "@", "]"]); + assert_eq!( + get_split_tokens(&pretok, "[C@H]"), + ["[", "C", "@", "H", "]"] + ); + assert_eq!(get_split_tokens(&pretok, "[C@@]"), ["[", "C", "@@", "]"]); + assert_eq!( + get_split_tokens(&pretok, "[Fe@TB3+3]"), + ["[", "Fe", "@TB", "3", "+", "3", "]"] + ); + } + + #[test] + fn test_standard_smiles_isotopes_charges() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "[16C]"), + ["[", "1", "6", "C", "]"] + ); + assert_eq!( + get_split_tokens(&pretok, "[C+12]"), + ["[", "C", "+", "1", "2", "]"] + ); + assert_eq!( + get_split_tokens(&pretok, "[CH4:200]"), + ["[", "C", "H", "4", ":", "2", "0", "0", "]"] + ); + } + + #[test] + fn test_standard_smiles_rings_bonds() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "C1CCCC2C1CCCC2"), + ["C", "1", "C", "C", "C", "C", "2", "C", "1", "C", "C", "C", "C", "2"] + ); + assert_eq!(get_split_tokens(&pretok, "C%12"), ["C", "%", "1", "2"]); + assert_eq!( + get_split_tokens(&pretok, "F/C=C/F"), + ["F", "/", "C", "=", "C", "/", "F"] + ); + assert_eq!( + get_split_tokens(&pretok, r"F/C=C\F"), + ["F", "/", "C", "=", "C", 
"\\", "F"] + ); + } + + #[test] + fn test_standard_smiles_complex() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "[Na+].[Cl-]"), + ["[", "Na", "+", "]", ".", "[", "Cl", "-", "]"] + ); + assert_eq!(get_split_tokens(&pretok, "CC-O"), ["C", "C", "-", "O"]); + assert_eq!( + get_split_tokens(&pretok, "O=C=O"), + ["O", "=", "C", "=", "O"] + ); + assert_eq!(get_split_tokens(&pretok, "C#N"), ["C", "#", "N"]); + assert_eq!( + get_split_tokens(&pretok, "c1ccccc1"), + ["c", "1", "c", "c", "c", "c", "c", "1"] + ); + assert_eq!( + get_split_tokens(&pretok, "FC(Br)(Cl)F"), + ["F", "C", "(", "Br", ")", "(", "Cl", ")", "F"] + ); + assert!(all_matches( + &pretok, + "OC[C@@H](O1)[C@@H](O)[C@H](O)[C@@H](O)[C@H](O)1" + )); + } + + #[test] + fn test_stochastic_object_simple() { + let pretok = BigSmirkPreTokenizer::default(); + // Simple polymer repeat unit with AA-type bonding + assert_eq!( + get_split_tokens(&pretok, "{[$]CC[$]}"), + ["{", "[", "$", "]", "C", "C", "[", "$", "]", "}"] + ); + } + + #[test] + fn test_stochastic_object_multiple_units() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "{[$]CC[$],[$]C(C)C[$]}"), + [ + "{", "[", "$", "]", "C", "C", "[", "$", "]", ",", "[", "$", "]", "C", "(", "C", + ")", "C", "[", "$", "]", "}" + ] + ); + } + + #[test] + fn test_ab_type_descriptors() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "{[<]CC[>]}"), + ["{", "[", "<", "]", "C", "C", "[", ">", "]", "}"] + ); + assert_eq!( + get_split_tokens(&pretok, "{[>]CCCCCC(=O)[<],[>]NCCCCCCN[<]}"), + [ + "{", "[", ">", "]", "C", "C", "C", "C", "C", "C", "(", "=", "O", ")", "[", "<", + "]", ",", "[", ">", "]", "N", "C", "C", "C", "C", "C", "C", "N", "[", "<", "]", + "}" + ] + ); + } + + #[test] + fn test_indexed_bonding_descriptors() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!(get_split_tokens(&pretok, "[$1]"), ["[", "$", "1", "]"]); + 
assert_eq!(get_split_tokens(&pretok, "[$2]"), ["[", "$", "2", "]"]); + + assert_eq!(get_split_tokens(&pretok, "[<1]"), ["[", "<", "1", "]"]); + assert_eq!(get_split_tokens(&pretok, "[>1]"), ["[", ">", "1", "]"]); + + assert_eq!( + get_split_tokens(&pretok, "{[$1]CC[$1],[$2]C(C)C[$2]}"), + [ + "{", "[", "$", "1", "]", "C", "C", "[", "$", "1", "]", ",", "[", "$", "2", "]", + "C", "(", "C", ")", "C", "[", "$", "2", "]", "}" + ] + ); + } + + #[test] + fn test_ladder_descriptors() { + let pretok = BigSmirkPreTokenizer::default(); + + assert_eq!( + get_split_tokens(&pretok, "[<1[<1]1]"), + ["[", "<", "1", "[", "<", "1", "]", "1", "]"] + ); + assert_eq!( + get_split_tokens(&pretok, "[$1[$2]3]"), + ["[", "$", "1", "[", "$", "2", "]", "3", "]"] + ); + + assert_eq!( + get_split_tokens(&pretok, "{[<1[<1]1]CC[>1[>1]1]}"), + [ + "{", "[", "<", "1", "[", "<", "1", "]", "1", "]", "C", "C", "[", ">", "1", "[", + ">", "1", "]", "1", "]", "}" + ] + ); + } + + #[test] + fn test_external_bond_order_with_descriptors() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "C=[$2]"), + ["C", "=", "[", "$", "2", "]"] + ); + assert_eq!( + get_split_tokens(&pretok, r"C/[>1]"), + ["C", "/", "[", ">", "1", "]"] + ); + assert_eq!( + get_split_tokens(&pretok, r"C\[<1]"), + ["C", "\\", "[", "<", "1", "]"] + ); + } + + #[test] + fn test_empty_terminal() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!(get_split_tokens(&pretok, "[]"), ["[", "]"]); + + assert_eq!( + get_split_tokens(&pretok, "{[]CC[$]}"), + ["{", "[", "]", "C", "C", "[", "$", "]", "}"] + ); + + assert_eq!( + get_split_tokens(&pretok, "{[]CC[]}"), + ["{", "[", "]", "C", "C", "[", "]", "}"] + ); + } + + #[test] + fn test_end_groups_semicolon() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "{[$]CC[$];[H][$],[$]O}"), + [ + "{", "[", "$", "]", "C", "C", "[", "$", "]", ";", "[", "H", "]", "[", "$", "]", + ",", "[", "$", "]", "O", "}" + ] + 
); + assert_eq!( + get_split_tokens(&pretok, "{[$]CC[$];C[$],[$]C}"), + [ + "{", "[", "$", "]", "C", "C", "[", "$", "]", ";", "C", "[", "$", "]", ",", "[", + "$", "]", "C", "}" + ] + ); + } + + #[test] + fn test_block_copolymer() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "{[$]CC[$]}{[$]CC(C)[$]}"), + [ + "{", "[", "$", "]", "C", "C", "[", "$", "]", "}", "{", "[", "$", "]", "C", "C", + "(", "C", ")", "[", "$", "]", "}" + ] + ); + assert_eq!( + get_split_tokens(&pretok, "CC{[$]CC[$]}CC"), + ["C", "C", "{", "[", "$", "]", "C", "C", "[", "$", "]", "}", "C", "C"] + ); + } + + #[test] + fn test_graft_copolymer_nested() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "{[$]CC(C{[<]CC[>]})[$]}"), + [ + "{", "[", "$", "]", "C", "C", "(", "C", "{", "[", "<", "]", "C", "C", "[", ">", + "]", "}", ")", "[", "$", "]", "}" + ] + ); + assert_eq!( + get_split_tokens(&pretok, "{[$]CC{[<]C[>]}[$]}"), + [ + "{", "[", "$", "]", "C", "C", "{", "[", "<", "]", "C", "[", ">", "]", "}", "[", + "$", "]", "}" + ] + ); + } + + #[test] + fn test_fragment_reference() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!(get_split_tokens(&pretok, "[#PEG]"), ["[", "#PEG", "]"]); + assert_eq!( + get_split_tokens(&pretok, "[#Styrene]"), + ["[", "#Styrene", "]"] + ); + assert_eq!(get_split_tokens(&pretok, "[#+]"), ["[", "#+", "]"]); + assert_eq!(get_split_tokens(&pretok, "[#PEG-1]"), ["[", "#PEG-1", "]"]); + assert_eq!(get_split_tokens(&pretok, "[#A]"), ["[", "#A", "]"]); + assert_eq!( + get_split_tokens(&pretok, "{[$][#Styrene][$]}"), + ["{", "[", "$", "]", "[", "#Styrene", "]", "[", "$", "]", "}"] + ); + } + + #[test] + fn test_fragment_definition_expansion() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "C([#R]).{#R=CO}"), + ["C", "(", "C", "O", ")"] + ); + assert_eq!( + get_split_tokens(&pretok, "{[#A]CC[#B]}.{#A=[<]}.{#B=[>]}"), + ["{", "[", "<", 
"]", "C", "C", "[", ">", "]", "}"] + ); + } + + #[test] + fn test_fragment_definitions_do_not_expand_bare_labels() { + let pretok = BigSmirkPreTokenizer::default(); + let tokens = get_split_tokens( + &pretok, + "A([$1[<1]1])R(A'[$1[>1]1])(A[$1[<1]2])A'[$1[>1]2].{#A=C}.{#A'=C}.{#R=C}", + ); + + assert_eq!( + tokens + .iter() + .filter(|token| matches!(token.as_str(), "A" | "A'" | "R")) + .count(), + 5 + ); + assert!(!tokens.iter().any(|token| token == "{" || token == "=")); + } + + #[test] + fn test_reject_invalid_bracket_symbol_forms() { + let pretok = BigSmirkPreTokenizer::default(); + assert!(!all_matches(&pretok, "[B|]")); + assert!(!all_matches(&pretok, "[C@@Hextra]")); + assert!(!all_matches(&pretok, "[$=]")); + assert!(!all_matches(&pretok, "[>#]")); + assert!(!all_matches(&pretok, "[$/]")); + assert!(!all_matches(&pretok, r"[$\]")); + assert!(!all_matches(&pretok, "[#PEG 1]")); + } + + #[test] + fn test_mixed_smiles_bigsmiles() { + let pretok = BigSmirkPreTokenizer::default(); + assert_eq!( + get_split_tokens(&pretok, "CCCC{[$]CC(c1ccccc1)[$]}CCCC"), + [ + "C", "C", "C", "C", "{", "[", "$", "]", "C", "C", "(", "c", "1", "c", "c", "c", + "c", "c", "1", ")", "[", "$", "]", "}", "C", "C", "C", "C" + ] + ); + assert_eq!( + get_split_tokens(&pretok, "CCCC{[$]CC[$]}CCCC.NCC"), + [ + "C", "C", "C", "C", "{", "[", "$", "]", "C", "C", "[", "$", "]", "}", "C", "C", + "C", "C", ".", "N", "C", "C" + ] + ); + } + + #[test] + fn test_opensmiles_spec() { + let pretok = BigSmirkPreTokenizer::default(); + let mut opensmiles_examples = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + opensmiles_examples.push("test"); + opensmiles_examples.push("opensmiles.smi"); + let examples = fs::read_to_string(opensmiles_examples.as_path()) + .expect("failed to open opensmiles.smi"); + for line in examples.lines().filter(|x| !x.starts_with("#")) { + dbg!(&line); + assert!(all_matches(&pretok, line)); + } + } + + #[test] + fn test_bigsmiles_spec() { + let pretok = 
BigSmirkPreTokenizer::default(); + let mut bigsmiles_examples = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + bigsmiles_examples.push("test"); + bigsmiles_examples.push("bigsmiles.smi"); + let examples = + fs::read_to_string(bigsmiles_examples.as_path()).expect("failed to open bigsmiles.smi"); + let mut failures = Vec::new(); + for (idx, line) in examples + .lines() + .enumerate() + .filter(|(_, x)| !x.starts_with("#") && !x.is_empty()) + { + if !all_matches_after_fragment_expansion(&pretok, line) { + failures.push(format!("line {}: {}", idx + 1, line)); + } + } + assert!( + failures.is_empty(), + "failed to tokenize BigSMILES fixtures:\n{}", + failures.join("\n") + ); + } +} diff --git a/src/pre_tokenizers/mod.rs b/src/pre_tokenizers/mod.rs index d2db9db..ded1c4d 100644 --- a/src/pre_tokenizers/mod.rs +++ b/src/pre_tokenizers/mod.rs @@ -1,9 +1,12 @@ +mod bigsmirk; mod smirk; +mod split_bigsmiles; mod split_smiles; use tokenizers::pre_tokenizers::split::{Split, SplitPattern}; use tokenizers::SplitDelimiterBehavior; +pub use bigsmirk::BigSmirkPreTokenizer; pub use smirk::SmirkPreTokenizer; pub fn split_structure() -> Split { diff --git a/src/pre_tokenizers/split_bigsmiles.rs b/src/pre_tokenizers/split_bigsmiles.rs new file mode 100644 index 0000000..6f20fb0 --- /dev/null +++ b/src/pre_tokenizers/split_bigsmiles.rs @@ -0,0 +1,84 @@ +use const_format::formatcp; + +const BRACKETED_SYMBOL: &'static str = concat!( + r"A(?:c|g|l|m|r|s|t|u)|", + r"B(?:a|e|h|i|k|r)?|", + r"C(?:a|d|e|f|l|m|n|o|r|s|u)?|", + r"D(?:b|s|y)|", + r"E(?:r|s|u)|", + r"F(?:e|l|m|r)?|", + r"G(?:a|d|e)|", + r"H(?:e|f|g|o|s)?|", + r"I(?:n|r)?|", + r"Kr?|", + r"L(?:a|i|r|u|v)|", + r"M(?:c|d|g|n|o|t)|", + r"N(?:a|b|d|e|h|i|o|p)?|", + r"O(?:g|s)?|", + r"P(?:a|b|d|m|o|r|t|u)?|", + r"R(?:a|b|e|f|g|h|n|u)|", + r"S(?:b|c|e|g|i|m|n|r)?|", + r"T(?:a|b|c|e|h|i|l|m|s)|", + r"U|", + r"V|", + r"W|", + r"Xe|", + r"Yb?|", + r"Z(?:n|r)|", + r"as|", + r"b|", + r"c|", + r"n|", + r"o|", + r"p|", + r"se?|", + r"\*", +); + 
+const CHIRAL: &'static str = r"@(?:@|AL|OH|SP|T(?:B|H))?"; + +pub const MATCH_OUTER_BIGSMILES: &'static str = concat!( + r"Br?|Cl?|F|I|N|O|P|S|", // organic subset elements + r"b|c|n|o|p|s|", // Aromatic organic subset + r"\*|", // Wildcard + r"[\.\-=\#\$:/\\]|", // Bonds + r"\d|%|", // Ring closures + r"\(|\)|", // Branch delimiters in SMILES and BigSMILES + r"\{|\}|", // Stochastic object delimiters + r",|;|", // Repeat unit separator and end group separator + r"[A-Z][A-Za-z0-9']*|", // Bare spec labels + r"\[(?:[^\[\]]+|\[[^\[\]]*\])*\]", // Bracketed atoms/descriptors +); + +pub const MATCH_INNER_BIGSMILES: &'static str = formatcp!(concat!( + r"^(?:", + r"", + r"|", + r"(\$|<|>)(\d+)?", + r"|", + r"(\$|<|>)(\d+)?(\[)(\$|<|>|[A-Z][A-Za-z0-9']*)(\d+)?(\])(\d+)", + r"|", + r"(#[!-~]+)", + r"|", + r"(\d+)?", + r"({BRACKETED_SYMBOL})", + r"(?:({CHIRAL})(\d{{1,2}})?)?", + r"(?:(H)(\d)?)?", + r"(?:([+-]{{1,2}})(\d{{1,2}})?)?", + r"(?:(:)(\d+))?", + r")$", +)); + +pub const BONDING_DESCRIPTOR: &'static str = concat!( + r"(\$|<|>)", // Descriptor type + r"(\d+)?", // Optional index +); + +pub const LADDER_BONDING_DESCRIPTOR: &'static str = concat!( + r"(\$|<|>)", // Outer descriptor type + r"(\d+)?", // Outer descriptor id + r"(\[)(\$|<|>|[A-Z][A-Za-z0-9']*)(\d+)?(\])", // Inner descriptor + r"(\d+)", // Group id +); + +pub const FRAGMENT_REFERENCE: &'static str = r"(#[!-~]+)"; diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 67bed1d..82bfcae 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1,7 +1,7 @@ use std::collections::{HashMap, HashSet}; use crate::gpe::{GpeTrainer, GPE}; -use crate::pre_tokenizers::{split_structure, SmirkPreTokenizer}; +use crate::pre_tokenizers::{split_structure, BigSmirkPreTokenizer, SmirkPreTokenizer}; use crate::wrapper::{ModelWrapper, PreTokenizerWrapper, TrainerWrapper}; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; @@ -51,7 +51,13 @@ fn normalizer() -> normalizers::Sequence { #[pymethods] impl SmirkTokenizer { 
#[new] - fn __new__() -> Self { + #[pyo3(signature = (bigsmiles = false))] + fn __new__(bigsmiles: bool) -> Self { + let pre_tokenizer: PreTokenizerWrapper = if bigsmiles { + BigSmirkPreTokenizer::default().into() + } else { + SmirkPreTokenizer::default().into() + }; let tokenizer: Tokenizer = TokenizerBuilder::new() .with_model( WordLevel::builder() @@ -60,7 +66,7 @@ impl SmirkTokenizer { .unwrap() .into(), ) - .with_pre_tokenizer(Some(SmirkPreTokenizer::default().into())) + .with_pre_tokenizer(Some(pre_tokenizer)) .with_normalizer(Some(normalizer().into())) .with_decoder(Some(Fuse::default().into())) .build() @@ -77,11 +83,17 @@ impl SmirkTokenizer { } #[staticmethod] - fn from_vocab(file: &str) -> Self { + #[pyo3(signature = (file, bigsmiles = false))] + fn from_vocab(file: &str, bigsmiles: bool) -> Self { + let pre_tokenizer: PreTokenizerWrapper = if bigsmiles { + BigSmirkPreTokenizer::default().into() + } else { + SmirkPreTokenizer::default().into() + }; let model = WordLevel::from_file(file, "[UNK]".to_string()).unwrap(); let tokenizer = TokenizerBuilder::new() .with_model(model.into()) - .with_pre_tokenizer(Some(SmirkPreTokenizer::default().into())) + .with_pre_tokenizer(Some(pre_tokenizer)) .with_normalizer(Some(normalizer().into())) .with_decoder(Some(Fuse::new().into())) .build() @@ -465,3 +477,59 @@ impl From for Encoding { } } } + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use tokenizers::PreTokenizer; + + use super::*; + + fn assert_bigsmiles_pre_tokenizer(tokenizer: &SmirkTokenizer) { + assert!(matches!( + tokenizer.tokenizer.get_pre_tokenizer(), + Some(PreTokenizerWrapper::BigSmirkPreTokenizer(_)) + )); + } + + fn get_splits(tokenizer: &SmirkTokenizer, text: &str) -> Vec { + let mut pretokenized = PreTokenizedString::from(text); + tokenizer + .tokenizer + .get_pre_tokenizer() + .unwrap() + .pre_tokenize(&mut pretokenized) + .unwrap(); + pretokenized + .get_splits(OffsetReferential::Original, OffsetType::Byte) + .into_iter() + .map(|(s, 
_, _)| s.to_string()) + .collect() + } + + #[test] + fn new_selects_bigsmiles_pre_tokenizer() { + let tokenizer = SmirkTokenizer::__new__(true); + assert_bigsmiles_pre_tokenizer(&tokenizer); + assert_eq!( + get_splits(&tokenizer, "{[$]CC[$]}"), + ["{", "[", "$", "]", "C", "C", "[", "$", "]", "}"] + ); + } + + #[test] + fn from_vocab_selects_bigsmiles_pre_tokenizer() { + let mut vocab_file = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + vocab_file.push("python"); + vocab_file.push("smirk"); + vocab_file.push("vocab_bigsmiles.json"); + + let tokenizer = SmirkTokenizer::from_vocab(vocab_file.to_str().unwrap(), true); + assert_bigsmiles_pre_tokenizer(&tokenizer); + assert_eq!( + get_splits(&tokenizer, "{[$]CC[$]}"), + ["{", "[", "$", "]", "C", "C", "[", "$", "]", "}"] + ); + } +} diff --git a/src/wrapper.rs b/src/wrapper.rs index ec6e626..3b33992 100644 --- a/src/wrapper.rs +++ b/src/wrapper.rs @@ -2,12 +2,13 @@ use serde::{Deserialize, Serialize}; use tokenizers::tokenizer::{Model, PreTokenizedString, PreTokenizer, Result, Trainer}; use crate::gpe::{GpeTrainer, GPE}; -use crate::pre_tokenizers::SmirkPreTokenizer; +use crate::pre_tokenizers::{BigSmirkPreTokenizer, SmirkPreTokenizer}; #[derive(Deserialize, Serialize, Clone, Debug, PartialEq)] #[serde(untagged)] pub enum PreTokenizerWrapper { PreTokenizer(tokenizers::PreTokenizerWrapper), + BigSmirkPreTokenizer(BigSmirkPreTokenizer), SmirkPreTokenizer(SmirkPreTokenizer), } @@ -15,6 +16,7 @@ impl PreTokenizer for PreTokenizerWrapper { fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { match self { Self::PreTokenizer(t) => t.pre_tokenize(pretokenized), + Self::BigSmirkPreTokenizer(t) => t.pre_tokenize(pretokenized), Self::SmirkPreTokenizer(t) => t.pre_tokenize(pretokenized), } } @@ -26,6 +28,12 @@ impl From for PreTokenizerWrapper { } } +impl From for PreTokenizerWrapper { + fn from(value: BigSmirkPreTokenizer) -> Self { + Self::BigSmirkPreTokenizer(value) + } +} + impl From for 
PreTokenizerWrapper { fn from(value: tokenizers::PreTokenizerWrapper) -> Self { Self::PreTokenizer(value) @@ -201,4 +209,11 @@ mod test { check_serde(&pretok.0.clone()); check_serde(&pretok); } + + #[test] + fn serialize_bigsmirk_pretok() { + let pretok = PreTokenizerWrapper::BigSmirkPreTokenizer(BigSmirkPreTokenizer::default()); + check_serde(&pretok.clone()); + check_serde(&pretok); + } } diff --git a/test/bigsmiles.csv b/test/bigsmiles.csv new file mode 100644 index 0000000..12b2a19 --- /dev/null +++ b/test/bigsmiles.csv @@ -0,0 +1,664 @@ +# Data from https://doi.org/10.1038/s41597-024-03212-4 +,SMILES,BigSMILES,Tg (C) +0,*C1COC2C1OCC2Oc1ccc(cc1)CNC(=O)CCCCCCC(=O)NCc1ccc(cc1)O*,{},21.58173134 +1,*OC(CCC(OC(=O)Nc1ccc(cc1)Cc1ccc(cc1)NC(=O)*)C)C,{},63.5893379 +2,*OC(=O)c1ccc(cc1)C(=O)OCCCC(=O)NCc1ccc(cc1)CNC(=O)CCC*,{},53.55726117 +3,*OC(=O)NCCNC(=O)OCC*,{},5.896093021 +4,*SCCCCC*,{},-55.37860961 +5,*Oc1ccc(cc1)C(=O)OC(=O)c1ccc(cc1)OCCCCCC*,{},64.73496741 +6,*c1[nH]c(cc1CC(=O)OCCCCCCCC)*,{},-4.076963699 +7,*C(C*)(CC(=O)OCCCCCCCCCC)C(=O)OCCCCCCCCCC,{$CC(C(=O)OCCCCCCCCCC)(CC(=O)OCCCCCCCCCC)$},75.04044311 +8,*OCC1C(C1)C*,{},-28.98581149 +9,*N(C(=O)CCCCCCCCCCCCCCCCC(=O)N(CCCCCC*)C)C,{},49.34222836 +10,*O[Si](*)(CCC(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)C,{},-68.87734458 +11,*S(=O)(=O)c1ccc(cc1)C(=O)CNc1ccc(cc1)NCC(=O)c1ccc(cc1)*,{},198.9026743 +12,*S(=O)(=O)c1ccc(cc1)C(=O)NCCNC(=O)c1ccc(cc1)*,{},205.0712987 +13,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)C(c2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)*)CCCCC)cc1,{},204.7640603 +14,*NC(=O)C(=O)NCCCCCCCCCCNC(=O)CCCCCCCCC(=O)NCCCCCCCCCC*,{},82.2677155 +15,*OC(=O)c1ccc(cc1)C(=O)OCCCCCCCCCCCCCCCCCCCC*,{},17.27783213 +16,*Oc1ccc(cc1)C(=O)Oc1ccc(cc1)Cc1ccc(cc1)OC(=O)c1ccc(cc1)OC(=O)c1ccc(cc1)C(=O)*,{},88.16409459 +17,*Oc1ccc(cc1)SSCCCCSSc1ccc(cc1)*,{},102.260186 +18,*Oc1ccc(cc1)C(=O)Nc1c(cc(cc1)c1cc(c(cc1)NC(=O)c1ccc(cc1)OCCCCCC*)Cl)Cl,{},133.9866306 
+19,*c1c(cc(c(c1)C=Cc1ccc(cc1)OCCC(CCCC(C)C)C)C=C*)C=Cc1ccc(cc1)OCCC(CCCC(C)C)C,{$C=Cc1cc(C=Cc2ccc(OCCC(C)CCCC(C)C)cc2)c(cc1C=Cc1ccc(OCCC(C)CCCC(C)C)cc1)$},2.868925186 +20,*c1ccc(cc1)C=C1C(=O)C(=Cc2ccc(cc2)C(=O)CCCCC(=O)*)CC1,{},197.4539358 +21,*SC(=O)CCCCC(=O)SCc1ccc(cc1)c1ccc(cc1)C*,{},-14.32353196 +22,*C(C*)(C(=O)OCCNC(=O)N(CC(C(C(OC1OC(C(C(C1O)O)O)CO)C(CO)O)O)O)CCCCCC)C,{$CC(C(=O)OCCNC(=O)N(CCCCCC)CC(O)C(O)C(OC1OC(CO)C(O)C(O)C1O)C(O)CO)(C)$},23.2402228 +23,*Oc1ccc(cc1)OC(=O)c1cc(cc(c1)NC(=O)c1ccc(cc1)NC(=O)C(N1C(=O)c2c(C1=O)cccc2)C)C(=O)*,{},58.7319127 +24,*OC(=O)Nc1ccc(cc1)C(=O)OCC1CCC(CC1)C*,{},148.8519878 +25,*Nc1c(cccc1)NC(=O)c1ccc(cc1)C(=O)*,{},231.8080905 +26,*C(C*)(C(=O)OCCCCCCOc1ccc(cc1)C(=O)Oc1ccc(cc1)OCCCCCC)C,{$CC(C(=O)OCCCCCCOc1ccc(C(=O)Oc2ccc(OCCCCCC)cc2)cc1)(C)$},14.67015775 +27,*Oc1ccc(cc1)S(=O)(=O)c1c2c(ccc1)c(ccc2)S(=O)(=O)c1ccc(cc1)Oc1c(cc(c(c1C)C)c1c(c(c(c(c1)C)*)C)C)C,{},300.6954413 +28,*OC(=O)NC1CCC(CC1)CC1CCC(CC1)NC(=O)OCC(C*)(C)C,{},15.9792949 +29,*OC(=O)NCCCCCC(=O)OCCCC*,{},47.28425166 +30,*N(c1c(cc(cc1)Cc1cc(c(cc1)N(C(=O)c1ccc(cc1)C(=O)*)CCC)C)C)CCC,{},179.9023847 +31,*Nc1ccc(cc1)NC(=O)c1cc(cc(c1)NC(=O)C(CCSC)N1C(=O)c2c(C1=O)cccc2)C(=O)*,{},230.3191562 +32,*N(c1c(cc(cc1)Cc1cc(c(cc1)N(C(=O)c1ccc(cc1)C(=O)*)C)C)C)C,{},167.9642319 +33,*OC(=O)CCCCCCC(=O)OCCc1ccc(cc1)N1ON1c1ccc(cc1)CC*,{},17.91289325 +34,*Oc1ccc(cc1)C(c1ccc(cc1)OC(=O)c1ccc(cc1)C(=O)*)(C(F)(F)F)C(F)(F)F,"{C(=O)c3ccc(cc3)C(=O)>}",150.9011854 +35,*c1ccc2[nH]c3c(c2c1)cc(cc3)*,{},246.5898577 +36,*C(C*)C(=O)NCCCC,{$CC(C(=O)NCCCC)$},-84.72234668 +37,*C(=C*)CCCCCCCCCCOc1ccc(cc1)c1ccc(cc1)OCC(CCCCCC)F,{$C=C(CCCCCCCCCCOc1ccc(c2ccc(OCC(F)CCCCCC)cc2)cc1)$},27.34580481 +38,*NC(=O)NCCCCNC(=O)NCCCCCC*,{},60.10434296 +39,*Oc1ccc(cc1)C(=Cc1ccc(cc1)OC(=O)OCCCCCOC(=O)*)C,{},39.38257392 +40,*N1CCN(CC1)C(=O)CCN1CCN(CC1)CCC(=O)*,{},40.58060093 +41,*c1c(cc(cc1)C#C*)SCCCCCCCCCCCC,{$C#Cc1ccc(c(SCCCCCCCCCCCC)c1)$},53.41644401 +42,*OC(=O)Nc1cc(ccc1)C(=O)OCCCCCCCCCC*,{},45.09440244 
+43,*Oc1c(cc(cc1)C(=O)*)CCCCCC,{},85.05865655 +44,*Nc1ccc(cc1)NC(=O)C=CC(=O)*,{},115.1255691 +45,*C(C*)(C(=O)OC(Oc1ccccc1)C)C,{$CC(C(=O)OC(C)Oc1ccccc1)(C)$},31.196908 +46,*C(*)C,{$C(C)$},122.3867673 +47,*N1C(=O)c2c(C1=O)cc(cc2)Oc1cc2c(C(=O)N(C2=O)c2c(c3c(C(CC3)(C)C)c(c2C)*)C)cc1,{},325.2939228 +48,*Nc1ccc(cc1)C(=O)*,{},92.05287264 +49,*C(=C*)C,{$C=C(C)$},4.023221393 +50,*c1nc(nc(n1)Oc1c2c(ccc1C(=O)Oc1ccc(cc1)OC(=O)c1c(c3c(cc1)cccc3)O*)cccc2)N1CCCCC1,{},288.7077709 +51,*OS(=O)(=O)c1ccc(cc1)Sc1ccc(cc1)S(=O)(=O)Oc1ccc(cc1)C1(CCCCC1)c1ccc(cc1)*,{},157.8104834 +52,*c1c(c(c(c(c1Cl)Cl)CC*)Cl)Cl,{$CCc1c(Cl)c(Cl)c(c(Cl)c1Cl)$},56.1987128 +53,*OC(=O)c1ccc(cc1)C(=O)OCCCC(=O)NCCCCCCNC(=O)CCC*,{},60.02078862 +54,*c1c(c(cc(c1)*)C=Nc1ccc(cc1)F)O,{$c1cc(c(O)c(C=Nc2ccc(F)cc2)c1)$},-121.5212841 +55,*N=C1C=CC(=NC2=CC(=O)C(=CC2=O)*)C=C1,{},212.9967182 +56,*OC(=O)NCCSCCCCCSCCNC(=O)OCC1CCC(CC1)C*,{},77.84678315 +57,*OS(=O)(=O)c1ccc(cc1)Oc1ccc(cc1)S(=O)(=O)Oc1ccc(cc1)C1(CCCCC1)c1ccc(cc1)*,{},149.9071313 +58,*c1nc(nc(n1)Oc1cc2c(cc1)cc(cc2)C(=O)Oc1c(cccc1)OC(=O)c1cc2c(cc1)cc(cc2)O*)N1CCN(CC1)C,{},54.90728348 +59,*OC(=O)CC(=O)OCC*,{},-90.52916041 +60,*OC(=O)CCCCCCCC(=O)OCC(C*)(C)C,{},17.01342593 +61,*OC(=O)Nc1ccc(cc1)Cc1ccc(cc1)NC(=O)OCCN(CC*)CCCC(=O)Nc1ccc(cc1)N=Nc1ccccc1,{},-32.25789051 +62,*[Si](*)(CCCCCC)CCCCCC,{$[Si](CCCCCC)(CCCCCC)$},-11.37918107 +63,*c1n(c(cc1)C=C*)CCCCCC,{},-15.03969582 +64,*NC1CC(CC(C1)(CNC(=O)c1cc(cc(c1)NC(=O)c1ccc(cc1)NC(=O)C(CC(C)C)N1C(=O)c2c(C1=O)cccc2)C(=O)*)C)(C)C,{},195.2683571 +65,*Nc1cc(cc(c1)C(=O)Nc1ccccc1)NC(=O)c1cc(ccc1)C(=O)*,{},233.599405 +66,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)Oc2ccc(cc2)OC(=O)*)cc1,{},-71.68107141 +67,*OC(=O)CCCCCC(=O)OCCC*,{},-71.28231613 +68,*N1C(=O)C(CC1=O)SCCOCCSC1C(=O)N(C(=O)C1)c1ccc(cc1)C(=O)OCCCCCCOC(=O)c1ccc(cc1)*,{},19.42674542 +69,*NC(=O)CCCCCCCCC(=O)NCCCP(CCC*)c1ccccc1,{},56.20366062 +70,*c1nc2c([nH]1)cc(cc2)S(=O)(=O)c1cc2c(nc([nH]2)c2ccc(cc2)Oc2ccc(cc2)*)cc1,{},315.1120806 
+71,*C(C*)(C(=O)OCCCCCCOc1ccc(cc1)C=Nc1ccc(cc1)CCCC)C,{$CC(C(=O)OCCCCCCOc1ccc(C=Nc2ccc(CCCC)cc2)cc1)(C)$},12.41121342 +72,*Oc1ccc(cc1)Oc1ccc(cc1)NC(=O)c1cc(cc(c1)N1C(=O)c2c(C1=O)c(c(c(c2Cl)Cl)Cl)Cl)C(=O)Nc1ccc(cc1)*,{},315.5911196 +73,*c1ccc(cc1)/C=C/c1c(cc(c(c1)CCCCCC)/C=C/*)CCCCCC,{},51.04706134 +74,*Oc1ccc(cc1)OC(=O)*,"{C(=O)>}",105.0499992 +75,*C1(c2c(C(=O)O1)cccc2)c1ccc(cc1)NC(=O)c1ccc(cc1)C(=O)Nc1ccc(cc1)*,{},292.5925873 +76,*c1c(cc(c(c1)OCCOCCOCCOCCC(=O)O[Na])C#Cc1c(cc(c(c1)OC(COCCOCCOCCOC)COCCOCCOCCOC)C#C*)OC(COCCOCCOCCOC)COCCOCCOCCOC)OCCOCCOCCOCCC(=O)O[Na],{},-43.36051195 +77,*N(CC*)C(=O)CCCCCCCC,{},-49.55374382 +78,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)c2ccc(cc2)c2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)*)cc1,{},232.2309602 +79,*n1c(=O)c2cc3c(cc2c1=O)c(=O)n(c3=O)CCCCCC(=O)Oc1cc2c(cc1)cc(cc2)OC(=O)CCCCC*,{},-10.30414379 +80,*NC(=O)CCP(CCC(=O)NCCCCCC*)c1ccccc1,{},9.575432391 +81,*c1cc2C(c3c(c2cc1)ccc(c3)c1cc2C(c3c(c2cc1)ccc(c3)*)(CCCCCCCCCCCC)CCCCCCCCCCCC)(CCCCCCCC)CCCCCCCC,{},37.51200969 +82,*C1OC(=O)C(C1)CCC1C(=O)OC(C1)CCCCCCCCCC*,{},84.1573412 +83,*C(C*)C(=O)OC(CC(C)C)C,{$CC(C(=O)OC(C)CC(C)C)$},11.47675289 +84,*OC(C*)COc1ccc(cc1)C,{},-55.31696784 +85,*N(C(=O)CCCCCCCCCCCCCCC(=O)N(Cc1ccc(cc1)C*)C)C,{},-14.32362541 +86,*C1OC(OC(C1)*)O,{},136.0523749 +87,*c1c(cc(c(c1)OCCCCCCCCCC)c1ccc(cc1)*)OCCCCCCCCCC,{},61.07976291 +88,*c1c2c(nccc2)c(cc1)OCCOc1c2ncccc2c(cc1)C*,{},74.04018308 +89,*NC(CC(=O)*)C(=O)OCCCCCC,{},0.713924343 +90,*Oc1c(cc(cc1)OC(=O)c1ccc(cc1)OC(=O)c1ccc(cc1)C(=O)Oc1ccc(cc1)C(=O)*)C,{},81.15934241 +91,*n1c(=O)c2ccc3c(=O)n(c(=O)c4ccc(c1=O)c2c34)CCCCCCCCCCCC*,{},101.4116865 +92,*Oc1c(cc(cc1)C=C1C(=O)C(=Cc2cc(c(cc2)OC(=O)c2ccc(cc2)C(=O)*)OC)CCC1)OC,{},105.2246285 +93,*Nc1ccc(cc1)NC(=O)C1C(C(=CC(C1)C(C(=O)*)CC(=O)O)C)C(=O)O,{},149.7687276 +94,*Oc1ccc(cc1)OC(=O)c1c(cccc1)C=Cc1ccc(cc1)C=Cc1c(cccc1)C(=O)*,{},86.85824281 +95,*OC(=O)c1ccc(cc1)NC(=O)CCCCCCCCCCC(=O)Nc1ccc(cc1)C(=O)OCC*,{},14.08686284 
+96,*OC(=O)N(c1ccc(cc1)N(C(=O)OCC(C*)CCCCCCOc1ccc(cc1)c1ccc(cc1)C#N)C)C,{},78.11061669 +97,*C1C(C(C(C1)C=C*)C(=O)OCCCCCOc1ccc(cc1)c1ccc(cc1)C#N)C(=O)OCCCCCOc1ccc(cc1)c1ccc(cc1)C#N,{},27.68661296 +98,*C#CC(=C(*)CCCCOC(=O)NCCCCCC)CCCCOC(=O)NCCCCCC,{$C#CC(CCCCOC(=O)NCCCCCC)=C(CCCCOC(=O)NCCCCCC)$},5.219913288 +99,*c1ccc(cc1)/C=C/*,{},43.3930774 +100,*c1nc2c([nH]1)ccc(c2)c1ccc2c(nc([nH]2)c2ccc3S(=O)(=O)c4c(c3c2)cc(cc4)*)c1,{},355.5073659 +101,*n1c(=O)c2cc3c(cc2c1=O)c(=O)n(c3=O)c1ccc(cc1)C(c1ccc(cc1)*)C,{},307.2645731 +102,*Oc1c(cc(c(c1)OC(=O)c1ccc(cc1)C=Nc1cc(c(cc1)OCCCCCCOc1c(cc(cc1)N=Cc1ccc(cc1)C(=O)*)Cl)Cl)Cl)Cl,{},82.06188226 +103,*c1ccc2n(c3c(c2c1)cc(cc3)/C=C/c1ccc(cc1)Oc1c(cc(cc1)c1ccc(cc1)c1ccc(cc1)c1cc(c(cc1)Oc1ccc(cc1)/C=C/*)C(F)(F)F)C(F)(F)F)CC(CCCC)CC,{},187.4626146 +104,*Oc1ccc(cc1)Oc1ccc(cc1)C(=Nc1ccc(cc1)Oc1ccc(cc1)N=C(c1ccc(cc1)*)C)C,{},117.5082044 +105,*Oc1ccc(cc1)NC(=O)NCCCCCCNC(=O)*,{},122.4088725 +106,*C(C*)OC(=O)CC(=O)C,{$CC(OC(=O)CC(C)=O)$},9.819405608 +107,*Oc1c(cc(cc1)OC(=O)c1ccc(cc1)OCCOc1ccc(cc1)C(=O)*)C,{},68.01369706 +108,*Oc1ccc(cc1)OC(=O)c1ccc(cc1)NC(=O)c1ccc(cc1)C(=O)*,{},176.5386789 +109,*N1C(=S)SC(=CC=C2SC(=S)N(C2=O)CC*)C1=O,{},164.0140939 +110,*C1NC(=O)C(NC1=O)CCC(=O)OCCCC=CCCCOC(=O)CC*,{},25.37727633 +111,*Oc1ccc(cc1)N=Cc1cc(c(cc1)OC(=O)CCCCC(=O)Oc1c(cc(cc1)C=Nc1ccc(cc1)*)OC)OC,{},47.38250675 +112,*c1n(c(nn1)c1ccc(cc1)c1ccc(cc1)c1n(c(nn1)COc1ccc(cc1)C=C1C(=O)C(=Cc2ccc(cc2)OC*)CC1)c1ccccc1)c1ccccc1,{},140.326419 +113,*N1CCN(CC1)C(=O)C1C(C1)C(=O)*,{},100.5840527 +114,*C(=C*)c1c(sc(c1)C(F)(F)F)C(F)(F)F,{$C=C(c1cc(C(F)(F)F)sc1C(F)(F)F)$},54.57900551 +115,*N1C(=O)c2c(C1=O)cc(cc2)c1cc2c(C(=O)N(C2=O)c2ccc(cc2)c2ccc(cc2)*)cc1,{},421.9822435 +116,*c1cc2c(C(=O)N(C2=O)c2c(c(c(c(c2F)F)N2C(=O)c3c(C2=O)cc(cc3)C(*)(C(F)(F)F)C(F)(F)F)F)F)cc1,{},388.0137456 +117,*C#CC(=C(*)OS(=O)(=O)c1ccc(cc1)C)OS(=O)(=O)c1ccc(cc1)C,{$C#CC(OS(=O)(=O)c1ccc(C)cc1)=C(OS(=O)(=O)c1ccc(C)cc1)$},164.1018919 +118,*OC(=O)CCC(=O)OCCOCCOCC*,{},14.21396213 
+119,*Oc1ccc(cc1)C(c1ccc(cc1)OC(=O)c1cc(ccc1)C(=O)*)(CC)C,"{C(=O)c3cccc(c3)C(=O)>}",194.1629808 +120,*n1c(=O)c2c(c3c(cc2c1=O)c(=O)n(c3=O)CCCCCCCCCC*)Br,{},137.1087261 +121,*C(C*)(C(=O)OCCCCCCCCOC(=O)OC1CC2=CCC3C(CCC4(C(CCC34)C(CCCC(C)C)C)C)C2(CC1)C)C,{$CC(C(=O)OCCCCCCCCOC(=O)OC1CCC2(C)C(=CCC3C2CCC2(C)C(C(C)CCCC(C)C)CCC32)C1)(C)$},47.48888608 +122,*OC(=O)CCCCCCCC(=O)OCCCCCCCCC*,{},3.667443154 +123,*Oc1ccc(cc1)c1c(cc(c(c1)OCCCCCC)c1ccc(cc1)OC(=O)c1ccc(cc1)C(=O)*)OCCCCCC,{},81.55306128 +124,*c1c2c(nccc2)c(cc1)CC*,{},70.93028064 +125,*c1n(c(cc1)CCC(=O)OC(=O)CC*)C,{},78.11959859 +126,*N1C(=O)C2OC3C(C2C1=O)C(=O)N(C3=O)c1ccc(cc1)c1c(nc2c(n1)cc(cc2)S(=O)(=O)c1cc2c(nc(c(n2)c2ccc(cc2)*)c2ccccc2)cc1)c1ccccc1,{},373.476202 +127,*c1oc(cc1)Sc1oc(cc1)C=C1C(=O)C(=C*)CC1,{},102.2883172 +128,*Oc1c(c(ccc1)Oc1ccc(cc1)Nc1ccc(cc1)Nc1ccc(cc1)Nc1ccc(cc1)Nc1ccc(cc1)*)C(=O)Nc1ccc(cc1)N=Nc1ccccc1,{},238.4094841 +129,*=C=C=C(C(=*)CO)CO,{$=C=C=C(CO)C(CO)=$},42.01413885 +130,*c1oc(cc1)Sc1oc(cc1)C=NCCN=C*,{},95.6213855 +131,*NC(=O)/C=C/CC/C=C/C(=O)NCCCCCC*,{},2.127453311 +132,*Nc1c(cc(c(c1)SCCC#N)NC(=O)c1ccc(cc1)C(=O)*)SCCC#N,{},214.7483216 +133,*c1c(cc(cc1)CC*)C(=O)C,{$CCc1ccc(c(C(C)=O)c1)$},8.997925789 +134,*Oc1ccc(cc1)C(=O)c1ccc(cc1)Oc1ccc(cc1)C(=O)c1ccc(cc1)C(=O)c1ccc(cc1)*,{},138.8344532 +135,*c1sc(nn1)c1cc(ccc1)OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCCOc1cc(ccc1)*,{},35.51374296 +136,*Nc1c(cc(c(c1)OC)*)OC,{},168.2573371 +137,*SCSCCCC*,{},1.410869343 +138,*c1nc(sc1)NC(=O)NCCCCCCNC(=O)Nc1nc(cs1)c1ccc(cc1)C=C1C(=O)C(=Cc2ccc(cc2)*)CCC1,{},47.02441211 +139,*N(CC*)C(=O)c1ccc(cc1)C,{},87.3045689 +140,*n1c(=O)c2cc3c(cc2c1=O)c(=O)n(c3=O)CCCCCC(=O)Oc1ccc(cc1)c1ccc(cc1)OC(=O)CCCCC*,{},60.41501019 +141,*Oc1c(c(c(c(c1[2H])[2H])C(c1c(c(c(c(c1[2H])[2H])OC(=O)*)[2H])[2H])(C([2H])([2H])[2H])C([2H])([2H])[2H])[2H])[2H],{},230.9913822 +142,*Oc1ccc(cc1)Oc1ccc(cc1)Oc1ccc(cc1)c1ccc(cc1)Oc1ccc(cc1)Oc1ccc(cc1)Oc1c(cc(cc1)C(=O)c1cc(c(cc1)*)S(=O)(=O)O)S(=O)(=O)O,{},151.7193386 
+143,*c1c2c(nsn2)c(cc1)c1sc(c(c1)CCCCCC)C=Cc1sc(c(c1CCCCCC)CCCCCC)C=Cc1sc(cc1CCCCCC)*,{},17.9542177 +144,*N1C(=O)c2c(C1=O)c(ccc2)c1cc2c(C(=O)N(C2=O)c2ccc(cc2)C(=O)Nc2ccc(cc2)Oc2ccc(cc2)Oc2ccc(cc2)NC(=O)c2ccc(cc2)*)cc1,{},178.6163373 +145,*c1cc2n(c3c(c2cc1CCCCCC)cc(c(c3)C=C*)CCCCCC)CCCCCCCC,{},45.45691415 +146,*C#CC(=C(*)c1cncnc1)CCCCOC(=O)NCC(=O)OCCCC,{$C#CC(CCCCOC(=O)NCC(=O)OCCCC)=C(c1cncnc1)$},73.99227071 +147,*OC(=O)c1ccc(cc1)C(=O)NCCCNC(=O)c1ccc(cc1)C(=O)OCCCCCCCCCC*,{},76.72929425 +148,*Oc1ccc(cc1)S(=O)(=O)c1ccc(cc1)Oc1c(cc(cc1)C(c1cc(c(cc1)*)[N-][N+]#N)(C)C)[N-][N+]#N,{},181.1592195 +149,*c1sc(cc1CCCCCCCC)c1sc(cc1CCCCCC(C(C(C(F)(F)F)(F)F)(F)F)(F)F)*,{},6.476080227 +150,*c1c2C(=O)N(C(=O)c2c(c2ccccc12)c1ccc(cc1)Oc1ccc(cc1)S(=O)(=O)c1ccc(cc1)Oc1ccc(cc1)*)c1c(cccc1)F,{},314.5599695 +151,*NC(=O)CCCCCCCCCCC(=O)NCC1CC(CCC1)C*,{},26.18556129 +152,*Nc1ccc(cc1)NC(=O)c1cc(cc(c1)C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)C(=O)*,{},235.6895845 +153,*c1c(cc(c(c1)O)O)*,{$c1cc(O)c(O)cc1$},158.9171015 +154,*Oc1ccc(cc1)C(=Cc1ccc(cc1)OCCCCCCC*)C,{},36.76976951 +155,*C(=C(*)C)[Si](CC)(C)C,{$C(C)=C([Si](C)(C)CC)$},157.1009863 +156,*OC(=O)CCCCCCCC(=O)*,{},-68.69432556 +157,*OC(=O)c1ccc(cc1)NC(=O)CCCCCCCCCCC(=O)Nc1ccc(cc1)C(=O)OCCCCCCCCCCCC*,{},-26.12929226 +158,*c1c(cc(c(c1)OCCCCCCCCCC)*)OCCCCCCCCCC,{},5.423112055 +159,*C(C(=O)*)(C)C,{$C(=O)C(C)(C)$},105.8292511 +160,*C#CC(=C(*)Cn1c2ccc(cc2c2cc(ccc12)CCCCCCCCCCCCCCCC)CCCCCCCCCCCCCCCC)Cn1c2ccc(cc2c2cc(ccc12)CCCCCCCCCCCCCCCC)CCCCCCCCCCCCCCCC,{$C#CC(Cn1c2ccc(CCCCCCCCCCCCCCCC)cc2c2cc(CCCCCCCCCCCCCCCC)ccc21)=C(Cn1c2ccc(CCCCCCCCCCCCCCCC)cc2c2cc(CCCCCCCCCCCCCCCC)ccc21)$},-2.237818484 +161,*C(C*)CCCC(C)C,{$CC(CCCC(C)C)$},5.980301023 +162,*N1C(=O)C(CC1=O)Oc1ccc(cc1)N=Cc1ccc(cc1)OC1C(=O)N(C(=O)C1)c1ccc(cc1)Oc1ccc(cc1)*,{},154.6342574 +163,*c1nc(nc(n1)Oc1c(cccc1)C(=O)Oc1ccc(cc1)OC(=O)c1c(cccc1)O*)N(CC)c1ccccc1,{},146.7365026 +164,*c1nc2c(n1CCCS(=O)(=O)O)ccc(c2)c1ccc2c(nc(n2CCCS(=O)(=O)O)c2cc(ccc2)*)c1,{},208.6363648 
+165,*OC(=O)C1C(=O)CC(C(=O)C1)C(=O)OCCCCCC*,{},-36.66445232 +166,*Oc1ccc(cc1)C(c1ccc(cc1)Oc1ccc(cc1)C(=O)c1ccc(cc1)*)c1c(cccc1)C(=O)O[Na],{},169.1341304 +167,*N(C(=O)CCCCCCCCCCCCCCCCC(=O)N(Cc1ccc(cc1)C*)CC)CC,{},4.279694171 +168,*c1nc(nc(n1)Oc1ccc(cc1)C(c1ccc(cc1)O*)(C)C)OC,{},121.2094568 +169,*OC(=O)NCc1ccc(cc1)CNC(=O)OCCCCCCCC*,{},-25.31417235 +170,*NC(=O)CCCCCCCC(=O)NCCCCCCCCC*,{},9.904824538 +171,*N1C(=O)c2c(C1=O)cc(cc2)Oc1cc2c(C(=O)N(C2=O)c2cc(ccc2)Oc2ccc(cc2)S(=O)(=O)c2ccc(cc2)Oc2cc(ccc2)*)cc1,{},199.6619548 +172,*c1n(c(cc1)*)CC,{},112.6054346 +173,*OC(=O)CCCCS(=O)(=O)CCCCC(=O)OCCCCCCCCCC*,{},37.87494117 +174,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)N2C3(C(=O)N(C2=O)C(=O)*)CCCCC3)cc1,{},277.0792131 +175,*C(=C*)c1c(cccc1)C,{$C=C(c1ccccc1C)$},261.6623551 +176,*Oc1cc2C(c3c(c2cc1)ccc(c3)Oc1ccc(cc1)c1c2cc(ccc2c(c2ccccc12)c1ccc(cc1)*)CCC)(c1ccc(cc1)N(c1ccccc1)c1ccccc1)c1ccc(cc1)N(c1ccccc1)c1ccccc1,{},187.3941077 +177,*SC(=O)CCCCC(=O)SCC*,{},-21.91188888 +178,*OC(CC*)(C(F)(F)F)C(F)(F)F,{},-8.026293088 +179,*C(C(*)c1ccccc1)[N+](=O)[O-],{$C(c1ccccc1)C([N+](=O)[O-])$},173.9913454 +180,*N1C(=O)c2c(C1=O)cc(cc2)C(c1cc2c(C(=O)N(C2=O)CC(=O)NNC(=O)c2ccc(cc2)C(=O)NNC(=O)C*)cc1)(C(F)(F)F)C(F)(F)F,{},226.7602549 +181,*C(C*)CCCCCCCCCCCCCCCCCCCC,{$CC(CCCCCCCCCCCCCCCCCCCC)$},21.77937998 +182,*OS(=O)(=O)c1ccc(cc1)Oc1ccc(cc1)S(=O)(=O)Oc1ccc(cc1)c1ccc(cc1)*,{},190.021745 +183,*SCCCCCC(=O)NCCc1ccc(cc1)CCNC(=O)CCCCC*,{},-18.01175462 +184,*N1CCC(CC1)C(=O)c1c(cc(cc1)C(=O)N1CCC(CC1)CCC*)Oc1ccccc1,{},131.1821319 +185,*C(C(CC*)(F)F)(Cl)F,{$CCC(F)(F)C(Cl)(F)$},-41.84388225 +186,*n1c(=O)c2cc3c(cc2c1=O)c(=O)n(c3=O)CC(=O)OCCCCCCOC(=O)C*,{},55.49864746 +187,*c1oc(cc1)Sc1oc(cc1)C=Nc1cc(ccc1)N=C*,{},122.5813993 +188,*C(C*)S(=O)c1ccccc1,{$CC(S(=O)c1ccccc1)$},137.9705527 +189,*OC(=O)c1cc(c(cc1)C(=O)Nc1ccc(cc1)C(c1ccc(cc1)C(c1ccc(cc1)NC(=O)c1c(ccc(c1)C(=O)OCCN(CC*)CCCCOc1ccc(cc1)N=Nc1ccc(cc1)CCCCCC)C(=O)O)(C)C)(C)C)C(=O)O,{},82.18427933 
+190,*=C=C=C(C(=*)COC(=O)NCC)COC(=O)NCC,{$=C=C=C(COC(=O)NCC)C(COC(=O)NCC)=$},172.0633872 +191,*Oc1ccc(cc1)c1ccc(cc1)OCCCCCCOC(=O)Nc1c(ccc(c1)NC(=O)OCCCCOC(=O)Nc1cc(c(cc1)C)NC(=O)OCCCCCC*)C,{},41.49405569 +192,*c1sc(c(c1C#N)C#N)N=Cc1ccc(cc1)C=N*,{},204.4271705 +193,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)Oc2ccc(cc2)Oc2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)*)cc1,{},126.2551791 +194,*c1cc2c(C(=O)N(C2=O)c2cc(ccc2)P(=O)(c2cc(ccc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)N2C(=O)N(C(=O)C2(C)C)C(=O)*)c2ccccc2)cc1,{},138.7903633 +195,*OCOCCCCCC*,{},-41.68464364 +196,*Oc1ccc(cc1)OC(=O)c1c(cc(c(c1)OCCCCCCCCCCCCCCCC)C(=O)*)OCCCCCCCCCCCCCCCC,{},77.98044592 +197,*c1sc(cc1)/C=C/c1ccc(cc1)N(c1ccc(cc1)N(c1ccc(cc1)/C=C/c1sc(cc1)/C=C(/C(=O)Nc1cc(ccc1)NC(=O)/C(=C/*)/C#N)\C#N)c1ccccc1)c1ccccc1,{},243.9892983 +198,*N(c1ccc(cc1)Cc1ccc(cc1)N(C(=O)c1ccc(cc1)C(=O)*)CCC)CCC,{},173.7390032 +199,*Oc1ccc(cc1)C(c1ccc(cc1)OCC#CC#CC*)(C)C,{},48.046331 +200,*NC(=O)C(C(C(=O)NCCCCCC*)O)O,{},149.1553302 +201,*NC(=O)CCCCCC(=O)NCc1ccc(cc1)C*,{},61.73836642 +202,*Nc1c(c(c(cc1)*)Cl)Cl,{},115.5762842 +203,*NC(=O)C(C(=O)NCCCCCCCCCC*)CCCCCCCCCCCC,{},-0.868265132 +204,*N1C(=O)c2c(C1=O)c(c(c(c2F)F)Oc1c(c(c(c(c1F)F)Oc1c(c2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)*)c(c1F)F)F)F)F)F,{},283.2696326 +205,*C1(c2c(C(=O)O1)cccc2)c1cc(c(c(c1)Br)OC(=O)c1cc(ccc1)C(=O)Oc1c(cc(cc1Br)*)Br)Br,{},245.9668197 +206,*OC(=O)CCCCSCCCCC(=O)OCCCCCCCCCC*,{},-22.30064955 +207,*OC(=O)c1c(cccc1)C(=O)OCc1ccc(cc1)C*,{},95.33964688 +208,*C=CCC(CC*)(C(=O)OC)C,{$C=CCC(C)(C(=O)OC)CC$},-28.15217408 +209,*O[Si](*)(CCCCCOc1ccc(cc1)OC(=O)c1ccc(cc1)OCCCC)C,{},-64.12566312 +210,*Nc1ccc(cc1)CCc1ccc(cc1)NC(=O)c1ccc(cc1)C(=O)*,{},245.2414672 +211,*=NN=CC#C[Si](C#CC=*)(c1ccccc1)c1ccccc1,{<=CC#C[Si](c1ccccc1)(c1ccccc1)C#CC=NN=>},131.2577432 +212,*Oc1ccc(cc1)C(=O)OCCOCCOCCOC(=O)c1ccc(cc1)OC(=O)Nc1c(ccc(c1)NC(=O)*)C,{},100.4455143 +213,*Oc1ccc(cc1)c1ccc(cc1)OC(=O)c1c(cc(cc1)C(=O)*)Oc1ccc(cc1)C(c1ccccc1)(C)C,"{C(=O)c3ccc(cc3Oc3ccc(C(C)(C)c4ccccc4)cc3)C(=O)>}",135.0490958 
+214,*OC(=O)NCCSCCCCCSCCNC(=O)OCCCC*,{},3.000872148 +215,*Oc1cc(ccc1)OC(=O)c1cc(c(c(c1)C(C)(C)C)OC(=O)c1ccc(cc1)C(=O)Oc1c(cc(cc1C(C)(C)C)C(=O)*)C(C)(C)C)C(C)(C)C,{},135.2011857 +216,*Oc1ccc(cc1)C(c1ccc(cc1)OC(=O)*)(CCCC)C,"{C(=O)>}",118.8361345 +217,*c1cc2n3c(nc2cc1)c1ccc(cc1C3=O)Oc1cc2c3n(c4ccc(cc4n3)O*)C(=O)c2cc1,{},291.281809 +218,*Oc1c(cc(cc1)OC(=O)Oc1ccc(cc1)OC(=O)*)C,{},120.5520321 +219,*N1C(=O)c2c(C1=O)c(ccc2)c1cc2c(C(=O)N(C2=O)c2ccc(cc2)OCCCCCCCCCOc2ccc(cc2)*)cc1,{},47.98864595 +220,*C1OC(CO1)COCC1OC(OC1)CCCCCCCC(=O)OCCOC(=O)CCCCCCC*,{},-2.553202642 +221,*Oc1ccc(cc1)Oc1ccc(cc1)Oc1ccc(cc1)OC(=O)c1ccc(cc1)C(=O)*,{},136.3718208 +222,*C(C(=O)C*)c1ccc(cc1)C,{$CC(=O)C(c1ccc(C)cc1)$},67.57313464 +223,*C(C*)C(=O)OCC1(COC(OC1)(C)C)C,{$CC(C(=O)OCC1(C)COC(C)(C)OC1)$},95.74104893 +224,*Oc1c(cc(cc1)C=C1C(=O)C(=Cc2cc(c(cc2)OCCCC*)OC)CC1)OC,{},36.19808899 +225,*N=C1c2ccccc2C(=Nc2ccc(cc2)*)c2ccccc12,{},158.2579262 +226,*NC(=O)CCCCCCCCCCCCCCCC*,{},15.38660594 +227,*Nc1cc(ccc1)NC(=O)CCCCCC(=O)*,{},-12.07623639 +228,*c1cc2c(cc1)cc(cc2)*,{},239.5402998 +229,*c1ncc(cc1)c1ccc(nc1)*,{},138.2497069 +230,*C(C*)n1c2ccc(cc2c2cc(ccc12)Br)Br,{$CC(n1c2ccc(Br)cc2c2cc(Br)ccc21)$},115.6170166 +231,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)c2ccc(cc2)c2ccc(cc2)*)cc1,{},357.2965065 +232,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)c2ccc(cc2)c2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(*)(C(F)(F)F)C(F)(F)F)cc1,{},302.652739 +233,*OC(=O)NC1C(C1)NC(=O)OCCCC*,{},47.31586772 +234,*Oc1ccc(cc1)S(=O)(=O)c1ccc(cc1)Oc1c(cc(cc1C)C1C(CC(CC1)C(c1cc(c(c(c1)C)*)C)(C)C)C)C,{},250.477212 +235,*C1C(=O)N(C(=O)C1*)c1ccccc1,{},197.1821336 +236,*[Si](c1ccc(cc1)*)(C)C,{},126.7080905 +237,*Nc1c(c(c(c(c1[2H])[2H])*)[2H])[2H],{},235.370823 +238,*C1c2c(C(C=C1)C=C*)cc(cc2)CCCCCC,{$C=CC1C=CC(c2ccc(CCCCCC)cc21)$},9.704073219 +239,*c1ccc2n(c3c(c2c1)cc(cc3)/C=C/c1ccc(cc1)Oc1c(cc(cc1)c1ccc(cc1)c1ccc(cc1)c1cc(c(cc1)Oc1ccc(cc1)/C=C/*)C#N)C#N)CC(CCCC)CC,{},129.2206103 +240,*c1sc(c(c1OCCCCCCC)C)*,{},32.04437108 
+241,*OC(=O)C(=O)OCCCCCCCCCC*,{},-65.08541819 +242,*C=CC(C(*)C)C,{$C=CC(C)C(C)$},78.17880502 +243,*N1C(=O)c2c(C1=O)cc(cc2)Oc1cc2c(C(CC2(C)C)(c2ccc(cc2)Oc2cc3c(C(=O)N(C3=O)c3cc(ccc3)N3C(=O)c4c(C3=O)cc(cc4)Oc3ccc(cc3)C3(CC(c4c3cc(cc4)Oc3cc4c(C(=O)N(C4=O)c4cc(ccc4)*)cc3)(C)C)C)cc2)C)cc1,{},231.956044 +244,*=C1SC(=S)N(C1=O)c1ccc(cc1)N1C(=S)SC(=Cc2ccc(cc2)C=*)C1=O,{<=Cc1ccc(cc1)C=C2SC(=S)N(C2=O)c3ccc(cc3)N4C(=O)C(SC4=S)=>},187.1036635 +245,*Nc1c(cc(c(c1)C(=O)O)*)OC,{},93.18891585 +246,*NNC(=O)CCCCC(=O)NNC(=O)CCCCCCCC(=O)*,{},101.0022623 +247,*n1c(=O)c2sc3c(sc2c1=O)c(=O)n(c3=O)c1ccc(cc1)C(=O)Nc1ccc(cc1)NC(=O)c1ccc(cc1)*,{},327.2441987 +248,*C1CC(CC1)C*,{},42.35931484 +249,*Nc1ccc(cc1)NC(=O)c1c(cc(c(c1)C(=O)O)C(=O)*)C(=O)O,{},240.3936293 +250,*NNC(=O)c1cc(ccc1)C(=O)*,{},232.2266265 +251,*c1sc(c(c1CCCCCCCCCCCC)CCCCCCCCCCCC)c1sc(cc1)c1sc(cc1)*,{},14.60597757 +252,*C(C*)C(=O)NCC,{$CC(C(=O)NCC)$},55.10481694 +253,*Oc1c(cc(cc1C(C)(C)C)C(=O)*)C(C)(C)C,{},206.7951116 +254,*OS(=O)(=O)c1ccc(cc1)c1ccc(cc1)S(=O)(=O)Oc1c(cc(cc1Br)C1(CCCCC1)c1cc(c(c(c1)Br)*)Br)Br,{},268.4647521 +255,*C(C*)C(=O)NCCCCCCCCCCCC,{$CC(C(=O)NCCCCCCCCCCCC)$},21.73577755 +256,*N(c1ccc(cc1)*)CCCCCC,{},-128.6299242 +257,*Oc1c(cc(cc1)C(c1cc(c(cc1)OC(=O)c1cc(ccc1)C(=O)*)[N+](=O)[O-])(CCC(=O)O)C)[N+](=O)[O-],{},127.0784722 +258,*c1ncc(cc1)C(=O)NNC(=O)*,{},135.1836518 +259,*c1n(c(cc1)*)C(C(=O)NO)n1ccc2c1cccc2,{},180.3082327 +260,*N=Nc1ccc(cc1)*,{},144.6951233 +261,*c1sc2c(c1)sc(c2)c1sc(cc1CCCCCCCCCCBr)c1sc(c(c1)CCCCCCCCCCBr)*,{},24.95900908 +262,*C(CC(C*)c1ccccc1)(C(=O)OC)C#N,{$CC(c1ccccc1)CC(C(=O)OC)(C#N)$},127.0156605 +263,*=C=C=C(C(=*)COS(=O)(=O)c1ccc(cc1)OC)COS(=O)(=O)c1ccc(cc1)OC,{$=C=C=C(COS(=O)(=O)c1ccc(OC)cc1)C(COS(=O)(=O)c1ccc(OC)cc1)=$},55.30555285 +264,*c1c(cc(c(c1)OCCCCCCCCCC)C#C*)OCCCCCCCCCC,{},61.85443636 +265,*OCCCC(=O)NCCCCCCNC(=O)CCCCCC*,{},1.467345964 +266,*c1cncc(c1)C(=O)NCCCCCCCCCCNC(=O)*,{},85.60365033 
+267,*C(C(C(C(*)(F)F)(F)F)(F)F)(C(F)(F)F)F,{$C(F)(F)C(F)(F)C(F)(F)C(C(F)(F)F)(F)$},-86.88823628 +268,*Oc1cc(ccc1)C(=O)NNC(=O)c1cc(ccc1)C(=O)NNC(=O)CC*,{},234.2133464 +269,*Oc1cc(ccc1)NC(=O)c1ccc(cc1)C(c1ccc(cc1)C(=O)Nc1ccc(cc1)*)(C(F)(F)F)C(F)(F)F,{},157.6228264 +270,*c1n(c(nn1)CCCCCCCC*)N,{},-41.92176029 +271,*c1nc2c([nH]1)cc(cc2)c1cc2c(nc([nH]2)c2ccc(cc2)NC(=O)c2ccc(cc2)C(=O)Nc2ccc(cc2)*)cc1,{},358.667269 +272,*Nc1c(cc(cc1Cl)*)Cl,{},-0.214281278 +273,*c1ccc2n(c3c(c2c1)cc(cc3)N=Cc1sc(cc1)c1sc(cc1)C=N*)CCCCCC,{},95.07860424 +274,*N1C2(CCCC2)C(=O)N(C1=O)C(=O)c1ccc(cc1)N=Nc1ccc(cc1)C(=O)*,{},17.25361376 +275,*P(=Nc1ccc(cc1)N=P(CC*)(Cl)Cl)(Cl)Cl,{},-27.34500252 +276,*Nc1c(cc(c(c1)SCCC#N)NC(=O)c1cc(ccc1)C(=O)*)SCCC#N,{},38.16065966 +277,*C(C*)(c1ccc(cc1)OC(=O)CC)OC(=O)C,{$CC(c1ccc(OC(=O)CC)cc1)(OC(C)=O)$},2.162388076 +278,*C1C(=O)N(C(=O)C1C(C*)c1ccccc1)CCCCCCCC,{},105.6965321 +279,*c1sc2cc3c(cc2n1)sc(n3)CCCCC*,{},87.36313445 +280,*NC(C(=O)*)CCC(=O)OCCCCCCCCCCCC,{},30.09586697 +281,*NC(C(=O)*)CO,{},84.57547927 +282,*Nc1c(cccc1)NC(=O)CCCCCCC(=O)*,{},71.92438381 +283,*Oc1c(cc(cc1)C=Cc1ccc(cc1)C=Cc1cc(c(cc1)OC(=O)CCCCCCCCC(=O)*)C)C,{},35.47523522 +284,*c1c2c(nccn2)c(cc1)*,{},216.378732 +285,*Oc1ccc(cc1)Oc1ccc(cc1)C(=O)Nc1ccc(cc1)Oc1ccc(cc1)c1ccc(cc1)Oc1ccc(cc1)NC(=O)c1ccc(cc1)*,{},165.0428244 +286,*OP(=O)(OCCCCCCCCCCOc1ccc(cc1)C=Cc1ccc(cc1)OCCCCCCCCCC*)OCCCCCCCCCCOc1ccc(cc1)N=Nc1ccc(cc1)F,{},-7.2122695 +287,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)OCCCOc2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)Nc2ccc(cc2)S(=O)(=O)c2ccc(cc2)NC(=O)*)cc1,{},163.1829015 +288,*OC(=O)c1ccc(cc1)C(=O)NCCCCCCNC(=O)c1ccc(cc1)C(=O)OCCCCCCCCCC*,{},-20.66610996 +289,*NNC(=O)c1ccc(cc1)NC(=O)c1cc(cc(c1)N1C(=O)c2c(C1=O)c(c(c(c2Cl)Cl)Cl)Cl)C(=O)*,{},133.1528291 +290,*N=P(*)(OCCC(=O)C=C)OCCC(=O)C=C,{},-42.12432011 +291,*S(=O)(=O)c1ccc(cc1)c1ccc(cc1)*,{},229.0539301 +292,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)S(=O)(=O)c2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)N2C(=O)N(C(=O)C2(CC)C)C(=O)*)cc1,{},242.6534046 
+293,*OP(=O)(OCCCCCCCCCCOc1ccc(cc1)C=Cc1ccc(cc1)OCCCCCCCCCC*)OCCCCCCCCCCOc1ccc(cc1)N=Nc1ccc(cc1)C,{},5.817024886 +294,*NC(CCCCNC(=O)NCCCCNC(=O)*)C(=O)OC,{},0.336556425 +295,*C(=C*)c1nc2c(n1C)cccc2,{$C=C(c1nc2ccccc2n1C)$},-6.104199835 +296,*Oc1ccc(cc1)N=Nc1ccc(cc1)*,{},116.9759489 +297,*c1nc(nc(n1)NC(=O)c1c(cc(c(c1)C(=O)N*)C(=O)O)C(=O)OC(=O)Nc1c(ccc(c1)NC(=O)OCCCCCCCC)C)c1ccccc1,{},-1.691479041 +298,*c1[nH]c2cc3c(cc2n1)nc([nH]3)c1ccc(cc1)*,{},340.5865983 +299,*OC(=O)Cc1ccc(cc1)CC(=O)OCCCC*,{},-34.10658315 +300,*SC(=O)NCCCCCCNC(=O)SCCCC*,{},-33.13797704 +301,*c1c(cc(c(c1)OC)*)OC,{},63.90036252 +302,*N(CC*)C(=O)CCC(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F,{},-52.64825421 +303,*OC(=O)c1c(cccc1)NC(=O)c1ccc(cc1)C(=O)Nc1c(cccc1)C(=O)OC(=O)c1cc(ccc1)C(=O)*,{},185.0999075 +304,*c1sc(cc1CCCCCCCC)Nc1ccc(cc1)*,{},49.50827798 +305,*n1c(=O)c2cc3c(cc2c1=O)c(=O)n(c3=O)c1ccc(cc1)c1ccc(cc1)c1ccc(cc1)*,{},278.5221779 +306,*n1c(=O)c2c(c3c(c(c2c1=O)F)c(=O)n(c3=O)c1ccc(cc1)*)F,{},337.816724 +307,*OC(=O)c1ccc(cc1)C(=O)NCCNC(=O)c1ccc(cc1)C(=O)OCCCCCCCCCC*,{},4.250402609 +308,*NC(=O)NC(=O)NCc1c(c(cc(c1)Cc1c(c(cc(c1)C*)C=O)O)C=O)O,{},190.6072154 +309,*OC(=O)C(C*)(CCCC)CCCC,{},-17.98562642 +310,*C1(C(=O)C(CCC1)C*)CO,{},71.70468573 +311,*N1C(=O)c2c(C1=O)cc(cc2)Oc1ccc(cc1)Oc1cc2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)Oc2ccc(cc2)*)cc1,"{N4C(=O)c5ccc(cc5C4=O)Oc6ccc(cc6)Oc7ccc8c(c7)C(=O)N(C8=O)>}",191.2304459 +312,*C(=C*)c1cc(c(c(c1)CO)OCc1ccc(cc1)CNC(COCCCCCCCC)C)CO,{$C=C(c1cc(CO)c(OCc2ccc(CNC(C)COCCCCCCCC)cc2)c(CO)c1)$},65.78481038 +313,*n1c(=O)c2cc3c(cc2c1=O)c(=O)n(c3=O)CCCCCCCCCCCC*,{},73.37639604 +314,*c1sc2cc3c(cc2n1)sc(n3)c1ccc(cc1)c1ccc(cc1)*,{},343.9030515 +315,*c1sc(cc1COCCCCCCOc1ccc(cc1)c1ccc(cc1)C#N)*,{},59.88076613 +316,*Nc1ccc(cc1)CC(=O)*,{},63.68828699 +317,*C(C*)C(=O)N(c1ccccc1)O,{$CC(C(=O)N(O)c1ccccc1)$},141.1832818 +318,*Oc1c(cc(cc1)OC(=O)c1ccc(cc1)C(=O)*)c1ccccc1,"{C(=O)c2ccc(cc2)C(=O)>}",65.88587862 +319,*OC(CC(=O)*)C(C)C,{},-24.82439314 
+320,*NC(C(=O)*)CC(=O)OCc1ccccc1,{},70.10647363 +321,*c1c(c(cc(c1)N=Nc1ccc(cc1)[N+](=O)[O-])*)O,{},165.6801535 +322,*OCCCCC(=O)NCCCCCCNC(=O)CCCC*,{},-56.49395983 +323,*=C=C=C(C(=*)CCCCOC(=O)NC(=O)OCCCC)CCCCOC(=O)NC(=O)OCCCC,{$=C=C=C(CCCCOC(=O)NC(=O)OCCCC)C(CCCCOC(=O)NC(=O)OCCCC)=$},27.51087357 +324,*Oc1cc(c(cc1)S(=O)(=O)c1ccc(cc1)Oc1ccc(cc1)C(c1ccc(cc1)*)(C)C)[N-][N+]#N,{},196.3053138 +325,*c1sc(nn1)SCC(=O)NN=Cc1ccc(cc1)OCCCCOc1ccc(cc1)C=NNC(=O)CS*,{},39.79677892 +326,*OC(=O)NCCSCCCCSCCNC(=O)OCC*,{},26.25521298 +327,*C(C*)C(=O)N1CC[N+](CC1)(CCCCCCCCCCCC)C,{$CC(C(=O)N1CC[N+](C)(CCCCCCCCCCCC)CC1)$},-81.38297384 +328,*OC(=O)c1cc(ccc1)c1cc(ccc1)C(=O)OCCCCCCCCCC*,{},8.531981028 +329,*C1(C(=O)OC(=O)C1)C*,{},143.854555 +330,*Oc1cc(ccc1)OC(=O)c1ccc(cc1)C(=O)Oc1cc(ccc1)OCCCCCCCCCC*,{},20.92794041 +331,*Oc1ccc(cc1)OC(=O)c1ccc(cc1)OC(=O)c1ccc(cc1)C(=O)Oc1ccc(cc1)C(=O)Oc1ccc(cc1)OC(=O)c1cc(ccc1)C(=O)*,{},119.4238893 +332,*=C1CN(CC(=Cc2oc(cc2)C(=O)CCCCC(=O)c2oc(cc2)C=*)C1=O)C,{<=Cc1ccc(o1)C(=O)CCCCC(=O)c2ccc(o2)C=C3CN(C)CC(C3=O)=>},95.14114513 +333,*Oc1ccc(cc1)NC(=O)c1c(c(c(c(c1F)F)C(=O)Nc1ccc(cc1)*)F)F,{},168.6168885 +334,*C(=C(CC*)C)C,{$CCC(C)=C(C)$},46.49641909 +335,*S(=O)(=O)NCCNS(=O)(=O)c1ccc(cc1)*,{},7.54081281 +336,*N(C(=O)*)c1ccccc1,{},150.7130657 +337,*OC(=O)C(C(=O)OCCCCCC*)CCCCCCOc1ccc(cc1)c1ccc(cc1)OCc1ccc(cc1)[N+](=O)[O-],{},17.95743357 +338,*N1C(=S)SC(=Cc2ccc(cc2)C=C2SC(=S)N(C2=O)CCCCCC*)C1=O,{},35.37444876 +339,*c1oc(cc1)Sc1oc(cc1)C=Nc1ccc(cc1)N=C*,{},95.64631957 +340,*OC(=O)c1ccc(cc1)S(=O)(=O)CCCCCCS(=O)(=O)c1ccc(cc1)C(=O)OCCCCCC*,{},35.86441642 +341,*OS(=O)(=O)c1cc(ccc1)S(=O)(=O)Oc1cc(ccc1)*,{},29.19231194 +342,*SSC(=O)N(c1ccc(cc1)Cc1ccc(cc1)N(C(=O)SSCCCC*)C)C,{},20.68923822 +343,*C(C*)(c1ccc(cc1)OC(=O)C)OC(=O)C,{$CC(c1ccc(OC(C)=O)cc1)(OC(C)=O)$},74.85696518 +344,*Oc1ccc(cc1)C(=Cc1ccc(cc1)OC(=O)CCCCCCCCCCC(=O)*)C,{},6.458707655 +345,*N(C(=O)CCCCC(=O)N(CC(C(C(C*)(F)F)(F)F)(F)F)CC)CC,{},10.59395774 
+346,*c1ccc(cc1)C=C1C(=O)C(=Cc2ccc(cc2)C(=O)c2ccc(cc2)C(=O)*)CC1,{},192.2096838 +347,*Oc1cc(c(cc1)N1ON1c1c(cc(cc1)OC(=O)CCCCCCCCCCC(=O)*)C)C,{},50.67469492 +348,*c1c(cc(c(c1)Oc1ccc(cc1)S(=O)(=O)O[Na])c1ccc(cc1)*)Oc1ccc(cc1)S(=O)(=O)O[Na],{},172.5717242 +349,*OC(=O)NCCCCC*,{},-13.55087665 +350,*N1C(=O)C(CC1=O)Nc1ccc(cc1)NC1C(=O)N(C(=O)C1)c1ccc(cc1)Cc1ccc(cc1)*,{},248.5034267 +351,*C(=C*)c1ccc(cc1)OCCCCCC(=O)Oc1c(c(c(c(c1F)F)F)F)F,{$C=C(c1ccc(OCCCCCC(=O)Oc2c(F)c(F)c(F)c(F)c2F)cc1)$},73.83198457 +352,*Oc1cc(ccc1)OC(=O)c1ccc(cc1)C=Nc1ccc(cc1)OCCCCCCOc1ccc(cc1)N=Cc1ccc(cc1)C(=O)*,{},83.83402436 +353,*C1=NC2=CC(C=CC2=C1)*,{},103.156476 +354,*C=CCCCCCCCC*,{$C=CCCCCCCCC$},-17.2820223 +355,*C(CCC*)Cl,{$CCCC(Cl)$},-30.93658282 +356,*C#CC(=C(*)CCCCOC(=O)NCCC)CCCCOC(=O)NCCC,{$C#CC(CCCCOC(=O)NCCC)=C(CCCCOC(=O)NCCC)$},40.70123878 +357,*NC(CC(=O)*)c1ccccc1,{},-30.79261317 +358,*S(=O)(=O)C(C=CC(*)C)C,{},33.82674704 +359,*C(C(*)(C([2H])([2H])[2H])C([2H])([2H])[2H])([2H])[2H],{$C([2H])([2H])C(C([2H])([2H])[2H])(C([2H])([2H])[2H])$},85.2671101 +360,*c1sc(cc1)C(=O)NCCCCCCNC(=O)*,{},42.90894132 +361,*SC(=O)c1ccc(cc1)C(=O)SCc1c(c(c(c(c1C)C)C*)C)C,{},127.8963011 +362,*c1ccc2c(c1)c(=O)oc(n2)c1cc(cc(c1)N1C(=O)c2c(C1=O)c(c(c(c2Cl)Cl)Cl)Cl)c1oc(=O)c2c(ccc(c2)C*)n1,{},378.8956296 +363,*Nc1ccc(cc1)NC(=O)c1c(cc(c(c1)SCCCCCCCC)C(=O)*)SCCCCCCCC,{},69.67482713 +364,*Oc1c(cc(cc1)C=Nc1ccc(cc1)N=Cc1cc(c(cc1)OC(=O)CCCCC(=O)*)OC)OC,{},-41.85748469 +365,*Oc1ccc(cc1)OC(=O)c1ccc(cc1)OC(=O)CCCCCCC(=O)*,"{C(=O)c2ccc(cc2)OC(=O)CCCCCCC(=O)>}",72.32081554 +366,*c1nc(sc1)N=Cc1cc(c(cc1)OCCCCOc1c(cc(cc1)C=Nc1nc(cs1)c1ccc(cc1)Oc1ccc(cc1)*)OC)OC,{},104.0626058 +367,*Oc1ccc(cc1)S(=O)(=O)c1ccc(cc1)Oc1ccc(cc1)C(=O)NNC(=O)c1cc(c(cc1)NC(=O)c1ccc(cc1)*)O,{},111.457133 +368,*Oc1ccc(cc1)c1ccc(cc1)OC(=O)c1cc(ccc1)Oc1ccc(cc1)C(=O)*,"{C(=O)c3cccc(c3)Oc4ccc(cc4)C(=O)>}",89.5930824 +369,*C(C*)OCCC,{$CC(OCCC)$},-17.16506937 +370,*OC(=O)CCCC(=O)OCC(C(C(C(C*)(F)F)(F)F)(F)F)(F)F,{},-64.11657159 
+371,*OC1CCC(CC1)OC(=O)CCCCCCC(=O)*,"{C(=O)CCCCCCC(=O)>}",-35.83964331 +372,*N1C(=O)N(C(=O)C1(C)C)C(=O)c1ccc(cc1)N=Nc1ccc(cc1)C(=O)*,{},302.3547996 +373,*Oc1ccc(cc1)OCC(=O)OC(=O)c1ccc(cc1)C(=O)OC(=O)C*,{},80.44433982 +374,*c1c2c(c(cc1)*)cccc2,{},189.0821827 +375,*C1C(=O)N(C(=O)C1C(C*)c1ccccc1)c1ccc(cc1)Cl,{},168.825845 +376,*[Si](c1ccc(cc1)*)(OCC)OCC,{},7.02246836 +377,*NC(CNC(=O)NCCCCCCNC(=O)*)C,{},55.35434565 +378,*c1nc(cs1)c1ccc(cc1)c1nc(sc1)CCCC*,{},110.5188915 +379,*Oc1ccc(cc1)OC(=O)CCCCCCCC(=O)*,"{C(=O)CCCCCCCC(=O)>}",-6.058372606 +380,*C(C*)OC(=O)C(CC)(CC)CC,{$CC(OC(=O)C(CC)(CC)CC)$},20.41787387 +381,*NC(=O)CCCCCCCCCCCCCCC(=O)NCCc1ccc(cc1)CC*,{},65.25488595 +382,*C1CCN(CC1)SC(=O)OCCCCOC(=O)SN1CCC(CC1)CCC*,{},-6.032415023 +383,*Oc1cc2c(C(CC2(C)C)(c2ccc(cc2)Oc2ccc(cc2)C2(CC(c3c2cc(cc3)*)(C)C)C)C)cc1,{},308.5311609 +384,*OC(=O)c1ccc(cc1)CCc1ccc(cc1)C(=O)OCC*,{},53.2795594 +385,*c1oc2c(n1)cc(cc2)c1cc2c(oc(n2)CCCCCCCC*)cc1,{},61.93714913 +386,*C(=C*)CNS(=O)(=O)CC,{$C=C(CNS(=O)(=O)CC)$},44.748248 +387,*c1nc(ccc1)Oc1ccc(cc1)Oc1ccc(cc1)O*,{},-19.11943844 +388,*C(=C(*)C)[Si](CCCC)(C)C,{$C(C)=C([Si](C)(C)CCCC)$},152.5242236 +389,*OC(=O)CCSCCC(=O)*,{},6.950276755 +390,*Oc1ccc(cc1)Oc1ccc(cc1)NC(=C(C#N)C#N)c1ccc(cc1)c1ccc(cc1)C(=C(C#N)C#N)Nc1ccc(cc1)*,{},270.2905197 +391,*Oc1cc(ccc1)OC(=O)Oc1ccc(cc1)OC(=O)*,{},-104.3379932 +392,*OC(=O)c1ccc(cc1)NC(=O)CCCCC(=O)Nc1ccc(cc1)C(=O)OCCC*,{},83.99780359 +393,*c1nc2c([nH]1)cc(cc2)NC(=NO)C(=NO)Nc1ccc2c(nc([nH]2)CCCC*)c1,{},170.1130329 +394,*Nc1c(c(c(c(c1C)C)NC(=O)c1ccc(cc1)C(=O)*)C)C,{},205.6552433 +395,*OC(=O)Nc1ccc(cc1)NC(=O)OCC*,{},134.2908917 +396,*Oc1c(cc(cc1)OC(=O)c1ccc(cc1)C(=O)*)Cl,{},119.6241841 +397,*Oc1ccc(cc1)c1ccc(cc1)OC(=O)c1c(cc(c(c1)C(=O)OCCCCCC)C(=O)*)C(=O)OCCCCCC,{},77.13970172 +398,*N1CCN(CC1)CCC(=O)N(CCN(C(=O)CC*)C(C)C)C(C)C,{},-43.71593166 +399,*c1ccc2n(c3c(c2c1)cc(cc3)C=NN(c1ccc(cc1)S(=O)(=O)c1ccc(cc1)N(N=C*)CCCC)CCCC)CC,{},139.604302 +400,*Sc1ccc(cc1)c1ccc(cc1)SC(=O)CCCCC(=O)*,{},13.2621556 
+401,*N(c1ccc(cc1)c1ccc(cc1)N(C(=O)c1ccc(cc1)C(=O)*)CC)CC,{},62.56896406 +402,*S(=O)(=O)CCCC*,{},-14.41108992 +403,*c1ccc(cc1)C=C1C(=O)C(=Cc2ccc(cc2)C(=O)c2ccc(cc2)C(=O)*)CCC1,{},240.3798309 +404,*c1ccc2n(c3c(c2c1)cc(cc3)*)CC,{},206.6525359 +405,*OC(C(C(=O)*)(C)C)c1ccccc1,{},65.42132156 +406,*c1nc2c([nH]1)cc(cc2)c1ccc2c(nc([nH]2)c2ccc(cc2)C(=O)c2ccc(cc2)*)c1,{},300.9001313 +407,*Nc1ccc(cc1)NC(=S)NC(=O)c1ccc(cc1)C(=O)NC(=S)*,{},220.8197438 +408,*=c1c2cc3c(cc2c(=O)o1)c(=O)oc3=Nc1cc(ccc1)Oc1cc(ccc1)Oc1cc(ccc1)N=*,{<=Nc1cccc(c1)Oc2cccc(c2)Oc3cccc(c3)N=C4OC(=O)c5cc6c(cc54)C(OC6=O)=>},132.5253262 +409,*OS(=O)(=O)c1ccc(cc1)S(=O)(=O)c1ccc(cc1)S(=O)(=O)Oc1ccc(cc1)S(=O)(=O)c1ccc(cc1)*,{},194.2678014 +410,*OC(CCOC(=O)c1cc(ccc1)C(=O)*)C,"{C(=O)c1cccc(c1)C(=O)>}",-42.28619333 +411,*NC(=O)CCCCCCCCC(=O)NC*,{},66.76854223 +412,*NC(=O)CCP(=O)(CCC(=O)NCC*)C,{},143.8466204 +413,*C(C*)c1c(cccc1)OC,{$CC(c1ccccc1OC)$},160.10962 +414,*OCCCCCOCCCCCCOCCCCCC*,{},-72.01996519 +415,*NC(=O)C(=O)NCCNC(=O)CCCCCCCC(=O)NCC*,{},112.5256843 +416,*OC(=O)c1ccc(cc1)C(=O)OCC(C*)(CCl)CCl,{},11.46555203 +417,*OC(COC(=O)CCCCC(=O)*)C,"{C(=O)CCCCC(=O)>}",-15.95518318 +418,*c1ccc(cc1)c1ccc(cc1)C(*)(C)C,{},210.9469969 +419,*Oc1ccc(cc1)Oc1ccc(cc1)NC(=O)c1ccc(cc1)C(=O)Nc1ccc(cc1)*,{},203.5999878 +420,*Oc1c(cc(cc1)Oc1ccc(cc1)C(=O)c1ccc(cc1)*)CBr,{},98.31320425 +421,*Oc1cc(ccc1)C(=O)OC(=O)c1cc(ccc1)OCC*,{},3.697542049 +422,*C(=C*)c1ccc(cc1)[N+](=O)[O-],{$C=C(c1ccc([N+](=O)[O-])cc1)$},-12.60583746 +423,*c1sc(cc1)[Si](c1sc(cc1)[SiH](*)C)(C)C,{},60.04219897 +424,*Oc1ccc(cc1)C(c1ccc(cc1)OC(=O)c1ccc(cc1)Oc1ccc(cc1)C(=O)*)C,"{C(=O)c3ccc(cc3)Oc4ccc(cc4)C(=O)>}",76.95210942 +425,*C(C*)C(=O)n1sc2c(c1=O)cccc2,{$CC(C(=O)n1sc2ccccc2c1=O)$},48.67425788 +426,*Nc1cc(cc(c1)C(=O)OCCN(c1ccc(cc1)S(=O)(=O)C(C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)C)NC(=O)c1cc(cc(c1)OCCN(c1ccc(cc1)S(=O)(=O)C(C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)C)C(=O)*,{},172.7344272 
+427,*n1c(=O)c2cc3c(cc2c1=O)c(=O)n(c3=O)c1c(cc(cc1)c1cc(c(cc1)*)C)C,{},271.2577887 +428,*c1c(cc(c(c1)OCCCCCCOc1ccc(cc1)C1CCC(CC1)CCCCC)C=C*)OCCCCCCOc1ccc(cc1)C1CCC(CC1)CCCCC,{},36.67582945 +429,*OC(=O)SSC(=O)OCCCC*,{},-58.94578339 +430,*NC(C(=O)NCC(=O)*)C,{},19.96697332 +431,*NC(=O)CNC(=O)CC*,{},70.42884183 +432,*SC(=O)CCCCC(=O)SCc1c(c(c(c(c1C)C)C*)C)C,{},44.80678229 +433,*C(=C*)c1ccccc1,{$C=C(c1ccccc1)$},66.12991425 +434,*c1cc2c(C(=O)N(C2=O)c2cc(ccc2)NC(=O)c2cc(ccc2)C(=O)Nc2cc(ccc2)NC(=O)*)cc1,{},213.4893774 +435,*Oc1c(cc(cc1Br)C(c1cc(c(c(c1)Br)OC(=O)c1cc(ccc1)C(=O)*)Br)(CCC(=O)O)C)Br,{},175.2273279 +436,*OC(=O)CCCCC(=O)OCC(C*)(CCl)CCl,{},2.83724036 +437,*N1CCN(CC1)C(=O)SSCCCCSSC(=O)*,{},-3.571974592 +438,*N1C(=O)N(C(=O)C1(c1ccccc1)c1ccccc1)C(=O)c1ccc(cc1)N=Nc1ccc(cc1)C(=O)*,{},183.844758 +439,*c1cc2n(c3c(c2cc1)ccc(c3)C#CC#C*)CCCCCCCCCCCCCCCC,{},68.35869686 +440,*c1c2c(nccc2)c(cc1)OCc1ccc(cc1)COc1c2ncccc2c(cc1)C*,{},200.3538526 +441,*OC(=O)c1ccc(cc1)C(=O)OCCOCCOCC*,{},36.06576581 +442,*c1c2c(c(s1)*)sc(n2)CCCCCCCCC,{},56.24069524 +443,*=C=C=C(C(=*)COS(=O)(=O)c1ccc(cc1)C)COS(=O)(=O)c1ccc(cc1)C,{$=C=C=C(COS(=O)(=O)c1ccc(C)cc1)C(COS(=O)(=O)c1ccc(C)cc1)=$},76.80290526 +444,*Oc1c(cc(cc1)C=CC=Cc1cc(c(cc1)OCCCCCCC*)C)C,{},41.89270134 +445,*Oc1ccc(cc1)C(=O)OCCCCOC(=O)c1ccc(cc1)OC(=O)c1ccc(cc1)C(=O)*,{},68.79315579 +446,*Oc1c(c(ccc1)Oc1ccc(cc1)C(=O)Nc1ccc(cc1)Oc1ccc(cc1)NC(=O)c1ccc(cc1)*)C#N,{},160.9324159 +447,*C1C(=O)N(C(=O)C1*)c1ccc(cc1)COC(C)(C)C,{},122.5873684 +448,*c1ccc2n(c3c(c2c1)cc(cc3)C(=O)Oc1ccc(cc1)C(c1ccc(cc1)OC(=O)*)(C)C)C,{},73.83261176 +449,*/C=C/*,{$/C=C/$},59.5588378 +450,*c1ncc(cc1)c1n(c(cc1)c1n(c(cc1)*)C)C,{},256.5965094 +451,*c1ncc(cc1)*,{},322.0959561 +452,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)Nc2ccc(cc2)NC(=O)*)cc1,{},48.20867337 +453,*N(C(=O)*)CCCCCC,{},14.34014558 +454,*Nc1cc(ccc1)C#Cc1cc(ccc1)NC(=O)c1c(cc(cc1)C(=O)*)C(=O)O,{},187.6187871 +455,*c1c(cc(c(c1)C*)C)O,{$Cc1cc(c(O)cc1C)$},12.40187977 
+456,*Oc1cc(c(cc1)C(=O)Nc1ccc(cc1)NC(=O)c1c(cc(cc1)*)C(=O)O)C(=O)O,{},216.6231471 +457,*c1nc(nc(n1)Oc1cc(ccc1)C(=O)Nc1ccc(cc1)Oc1ccc(cc1)NC(=O)c1cc(ccc1)O*)Sc1ccccc1,{},184.9515774 +458,*=C1c2c(C(=O)O1)cc(cc2)c1cc2c(C(=O)OC2=Nc2cc(ccc2)Oc2cc(ccc2)Oc2cc(ccc2)N=*)cc1,{<=Nc1cccc(c1)Oc2cccc(c2)Oc3cccc(c3)N=C4OC(=O)c5ccc(cc54)c6ccc7c(c6)C(=O)OC7=>},193.7356518 +459,*Nc1c(cc(cc1)*)CC,{},162.1855704 +460,*OC(=O)NCCCCCCNC(=O)OCCN(CC*)c1ccc(cc1)N=Nc1ccc(cc1)C,{},24.09354398 +461,*c1nc2c([nH]1)ccc(c2)c1ccc2c(nc([nH]2)c2ccc(cc2)*)c1,{},423.6341908 +462,*c1ncc(cc1)C(=O)OC(=O)COc1ccc(cc1)OCC(=O)OC(=O)*,{},155.9709567 +463,*C(C*)C(=O)OCCN(S(=O)(=O)C(C(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)C,{$CC(C(=O)OCCN(C)S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)$},-148.0297376 +464,*OC(=O)c1cc(ccc1)C(=O)OCC1C(C1)C*,{},10.23490017 +465,*NC(C(=O)*)C(C)C,{},153.0278775 +466,*C(C*)(C(=O)OCC(C(C(C(C(C(C(F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F)F,{$CC(C(=O)OCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)(F)$},-98.70472035 +467,*C(*)C(=O)OC(C)(C)C,{$C(C(=O)OC(C)(C)C)$},12.73837737 +468,*OC(=O)c1c(cccc1)c1c(cccc1)C(=O)OCCCC*,{},71.34273106 +469,*Nc1ccc(cc1)Cc1ccc(cc1)NC(=O)Nc1ccc(cc1)CCc1ccc(cc1)NC(=O)*,{},183.1975298 +470,*C1C(=O)N(C(=O)C1C(C*)OC(=O)C)c1ccccc1,{},157.3759721 +471,*OC(CC(=O)*)C(Cl)Cl,{},110.2326669 +472,*OC(=O)NCCCCCCCCCCNC(=O)OCCCCCCCC*,{},24.78605928 +473,*C1C=CC(CC1)*,{},103.377349 +474,*NC(=O)NCCCP(CCC*)c1ccccc1,{},-49.19787298 +475,*c1oc(nn1)CCCCCCCC*,{},-48.88416733 +476,*OC(=O)NCCCCCCNC(=O)OCC(C*)(C)C,{},47.60248847 +477,*N=P(*)(OCC(C(C(F)(F)F)(F)F)(F)F)OCC(C(C(F)(F)F)(F)F)(F)F,{},-77.91107652 +478,*OC(=O)c1ccc(cc1)C(=O)OCCCCCC(=O)NCCNC(=O)CCCCC*,{},81.65245915 +479,*c1cc2c(C(=O)OC2=Nc2cc(ccc2)N=C2OC(=O)c3c2cc(cc3)C(=O)*)cc1,{},246.6584182 +480,*C*,{$C$},-2.526682925 +481,*c1c(nnc(n1)c1nc(ccc1)c1nc(c(nn1)c1ccccc1)c1ccc(cc1)Sc1ccc(cc1)*)c1ccccc1,{},419.5781202 
+482,*c1cc2c(C(=O)N(C2=O)c2c(cc(cc2)c2cc(c(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)Oc2ccc(cc2)C(c2ccc(cc2)OC(=O)*)(C)C)C)C)cc1,{},87.17702398 +483,*=c1sc(cs1)c1ccc(cc1)C=*,{<=Cc1ccc(cc1)C2=CSC(S2)=>},105.3372816 +484,*Oc1ccc(cc1)N=Cc1ccc(cc1)OC(=O)c1ccc(cc1)C=Nc1ccc(cc1)OCCCCOC(=O)NCCCCCCNC(=O)OCCCC*,{},-34.25655466 +485,*Oc1ccc(cc1)C(=O)CNc1ccc(cc1)NCC(=O)c1ccc(cc1)*,{},158.6944649 +486,*OC(=O)N(c1c(ccc(c1)N(C(=O)OCC*)C)C)C,{},42.46691504 +487,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)N2C(=O)N(C(C(=C2C)C(=O)OCC)c2ccc(cc2)Cl)C(=O)*)cc1,{},170.3261787 +488,*C(C*)(C(=O)Oc1ccc(cc1)C)C,{$CC(C(=O)Oc1ccc(C)cc1)(C)$},126.1154692 +489,*c1sc2c(n1)ccc(c2)OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCCOc1ccc(cc1)*,{},-17.21827415 +490,*c1nc2c(c(c1)OCCO*)cc(cc2)C,{},88.77435741 +491,*Oc1ccc(cc1)C(=O)CCCCCCCCC(=O)c1ccc(cc1)OC(=O)*,{},6.568396994 +492,*c1ccc2c(nc([nH]2)c2cc(ccc2)c2nc3c([nH]2)ccc(c3)C(=O)Nc2ccc(cc2)NC(=O)*)c1,{},309.3374253 +493,*OCC1(C2CCC(C1)CC2)C*,{},81.08225465 +494,*c1nc(ccc1)C=Nc1ccc(cc1)N=C*,{},139.1234551 +495,*/C(=C(/*)\c1ccccc1)/c1ccccc1,{$/C(c1ccccc1)=C(c1ccccc1)/$},206.5698859 +496,*N1C(CN(C(C1)C)C(=O)OCCOC(=O)*)C,{},60.10633691 +497,*C1C(CC1)*,{$C1CCC1$},69.57488221 +498,*OC(C*)CCCCCCCCOc1ccc(cc1)C(=O)Oc1ccc(cc1)C(=O)OCC(CC)C,{},-0.17829817 +499,*C(C*)(C(=O)OCCOC(=O)c1cc(cc(c1)OC(=O)c1ccc(cc1)N=Nc1ccc(cc1)OCCCCCCC)OC(=O)c1ccc(cc1)N=Nc1ccc(cc1)OCCCCCCC)C,{$CC(C(=O)OCCOC(=O)c1cc(OC(=O)c2ccc(N=Nc3ccc(OCCCCCCC)cc3)cc2)cc(OC(=O)c2ccc(N=Nc3ccc(OCCCCCCC)cc3)cc2)c1)(C)$},9.014452923 +500,*C(C*)C(=O)OCCOC(C(F)(F)F)(C(F)(F)F)F,{$CC(C(=O)OCCOC(F)(C(F)(F)F)C(F)(F)F)$},-51.63721715 +501,*Sc1ccc(cc1)*,{},64.586946 +502,*Oc1ccc(cc1)CCCNC(=O)CCCCC(=O)NCCCc1ccc(cc1)OCCCCC*,{},28.2211431 +503,*SSCCCCSSCCCCCC*,{},-41.26672381 +504,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)OCCN(CCOC(=O)*)c2ccc(cc2)N=Nc2ccc(cc2)[N+](=O)[O-])cc1,{},145.3751112 +505,*n1c(=O)c2cc3c(cc2c1=O)c(=O)n(c3=O)C(C(=O)N1C(=O)N(C(C1=O)(C)C)C(=O)C(*)C)C,{},252.5865186 
+506,*OC(=O)CCCCC(=O)OCc1ccc(cc1)C*,{},-4.158432897 +507,*Nc1c(cc(cc1)NC(=O)c1c(cc(c(c1)C(=O)*)C(=O)O)C(=O)O)S(=O)(=O)O[Na],{},129.970858 +508,*Nc1ccc(cc1)Cc1ccc(cc1)NC(=O)c1cc(cc(c1)NC(=O)C(CC(C)C)N1C(=O)c2c(C1=O)cccc2)C(=O)*,{},36.87047274 +509,*N1C(CN(C(C1)C)SC(=O)OCCCCOC(=O)S*)C,{},-30.512105 +510,*N(C(=O)*)CC=C,{},164.8639006 +511,*[Si](c1ccc(cc1)*)(c1ccc(cc1)CN(C)C)C,{},12.90627629 +512,*c1sc(cc1)C#CC#C*,{},49.13696662 +513,*c1ncnc(c1)C=Cc1ccc(cc1)C=C*,{},84.80053451 +514,*c1nc(nc(n1)Oc1c2c(ccc1C(=O)Nc1ccc(cc1)NC(=O)c1c(c3c(cc1)cccc3)O*)cccc2)N1CCCCC1,{},162.6671353 +515,*OC(=O)CCC(=O)OCCCCCCCCCC*,{},1.783806133 +516,*c1cc2n3c(=O)c4cc5c(cc4c3nc2cc1)c(=O)n1c2ccc(cc2nc51)*,{},384.637936 +517,*c1ccc2n(c3c(c2c1)cc(cc3)C(=O)Oc1ccc(cc1)C1(c2c(C(=O)O1)cccc2)c1ccc(cc1)OC(=O)*)C,{},262.5942508 +518,*=Nc1ccc(cc1)N=C(Nc1c(cc(cc1)c1cc(c(cc1)NC(=*)C)C(=O)O)C(=O)O)C,{<=Nc1ccc(cc1)N=C(C)Nc2ccc(cc2C(=O)O)c3ccc(c(C(=O)O)c3)NC(C)=>},89.38045943 +519,*OC(=O)C=C(CC(=O)OCC*)c1ccc(cc1)OCC,{},1.131191733 +520,*OC(=O)Nc1c(ccc(c1)NC(=O)OCCOCCOCCC*)C,{},-26.75831261 +521,*Nc1c(cccc1)CCc1c(cccc1)NC(=O)*,{},207.655323 +522,*SC(=O)Nc1ccc(cc1)Cc1ccc(cc1)NC(=O)SCc1ccc(cc1)C*,{},90.77725081 +523,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)NCCC[Si](O[Si](CCCNC(=O)*)(C)C)(C)C)cc1,{},143.0502256 +524,*OC(=O)c1cc(ccc1)C(=O)OCCNC(=O)c1ccc(cc1)C(=O)NCC*,{},121.124261 +525,*C(C*)(C(=O)OCCCCCCCCCCOc1ccc(cc1)N1C(=O)C(=Cc2c(c3c(n2C)cccc3)C)C(=C(C)C)C1=O)C,{$CC(C(=O)OCCCCCCCCCCOc1ccc(N2C(=O)C(=Cc3c(C)c4ccccc4n3C)C(=C(C)C)C2=O)cc1)(C)$},-17.96880959 +526,*SC(=O)CCCCCCCCC(=O)SCCCCCC*,{},-39.13613776 +527,*c1nc(ccc1)C(=O)NCCCCCCCCCCNC(=O)*,{},49.59402876 +528,*NC(=O)c1ccc(cc1)C(=O)NCCCCCCCCCC*,{},50.1155014 +529,*Nc1ccc(cc1)NC(=O)c1ccc(cc1)NC(=O)C=Cc1ccc(cc1)C=CC(=O)Nc1ccc(cc1)C(=O)*,{},169.7759737 +530,*NNC(=O)CCCCCCCCC(=O)NNC(=S)c1cc(ccc1)C(=S)*,{},32.4414411 +531,*N1C(=O)c2c(C1=O)cc(cc2)Oc1cc2c(C(=O)N(C2=O)c2ccc(cc2)NC(=O)c2cc(ccc2)C(=O)Nc2ccc(cc2)*)cc1,{},155.8375326 
+532,*NC(=S)C=Cc1ccc(cc1)Cc1ccc(cc1)C=CC(=S)NCC*,{},99.87529352 +533,*c1ccc2[nH]c3c(c2c1)cc(cc3)C(=O)c1c(cc(c(c1)C(=O)*)C(=O)O)C(=O)O,{},106.1795053 +534,*c1nc(sc1)NC(=O)Nc1ccc(cc1)Cc1ccc(cc1)NC(=O)Nc1nc(cs1)c1ccc(cc1)Oc1ccc(cc1)*,{},221.6298798 +535,*OC(=O)c1ccc(cc1)N1ON1c1ccc(cc1)C(=O)OCCOCC*,{},15.34426557 +536,*c1ccc2nc3c(Sc4cc(ccc4N3)c3cc4Sc5c(Nc4cc3)nc3ccc(cc3n5)O*)nc2c1,{},383.4 +537,*c1cc2Sc3c(=Nc2cc1)[nH]c1ccc(cc1n3)c1ccc2[nH]c3=Nc4c(Sc3nc2c1)cc(cc4)*,{},418.69 +538,*c1nc2c([nH]1)ccc(c2)c1ccc2c(nc([nH]2)c2oc(cc2)*)c1,{},419.98 +539,*=C1OC(=c2cc3ccc4=CC(=*)C=c5ccc(c2)c3c45)c2c3c4c1ccc1cccc(c41)c1c3c(ccc1)cc2,{<=c1cc2ccc3cc(cc4ccc(c1)c2c34)=c4oc(c5ccc6cccc7c8cccc9ccc4c(c98)c5c67)=>},432.43 +540,*c1cc2c3n(c4ccc(cc4n3)Oc3ccc4n5c(nc4c3)c3ccc(cc3C5=O)C(*)(C(F)(F)F)C(F)(F)F)C(=O)c2cc1,{},395.15 +541,*c1cc2n3c(=O)c4cc5c(cc4c3nc2cc1)c(=O)n1c2ccc(cc2nc51)O*,{},416.53 +542,*n1c(=O)c2cc3c(cc2c1=O)c(=O)n(c3=O)c1ccc(cc1)c1c(cc(cc1c1ccc(cc1)c1ccccc1)c1ccc(cc1)c1cc(c(c(c1)c1ccc(cc1)c1ccccc1)c1ccc(cc1)*)c1ccc(cc1)c1ccccc1)c1ccc(cc1)c1ccccc1,{},435 +543,*n1c(=O)c2cc3c(cc2c1=O)c(=O)n(c3=O)c1ccc2c(nc([nH]2)c2ccc(cc2)*)c1,{},456.35 +544,*N1C(=O)c2cc3C(c4c(Oc3cc2C1=O)cc1C(=O)N(C(=O)c1c4)c1cc(c(cc1)c1c(cc(cc1)*)C(F)(F)F)C(F)(F)F)(C(F)(F)F)C(F)(F)F,{},472.25 +545,*c1cc2nc3c4c5c6c(c3nc2cc1)cccc6c1nc2ccc(cc2nc1c5ccc4)*,{},411.97 +546,*=C1C=c2ccc3cc(=C4c5ccccc5C(=*)c5ccccc45)cc4ccc(=C1)c2c34,{<=c1cc2ccc3cc(cc4ccc(c1)c2c34)=c4c5ccccc5c(c5ccccc45)=>},437.49 +547,*c1n(c(cc1)*)C(C(=O)OC)C,{},279.4452403 +548,*NC(C(=O)NCC(=O)NCC(=O)*)C,{},208.6397491 +549,*c1sc2cc3c(cc2n1)sc(n3)c1c(cc(c(c1)OCCCCCC)*)OCCCCCC,{},168.5263131 +550,*C(*)C(=O)OC(CC)(C)C,{$C(C(=O)OC(C)(C)CC)$},136.5678336 +551,*N(c1ccc(cc1)*)CCCCCCC,{},110.7170963 +552,*Oc1ccc(cc1)OC(=O)c1c(cc(cc1)C(=O)*)c1ccccc1,"{C(=O)c2ccc(cc2c2ccccc2)C(=O)>}",227.700588 +553,*S(=O)(=O)NCCNS(=O)(=O)c1ccc(cc1)c1ccc(cc1)*,{},173.2454244 +554,*Oc1ccc(cc1)c1ccc(cc1)OC(=O)c1ccc(cc1)OC(=O)c1ccc(cc1)C(=O)Oc1ccc(cc1)C(=O)*,{},213.4133554 
+555,*c1[nH]c(cc1c1ccccc1)*,{},120.4503456 +556,*c1ccc(cc1)C1C(C(C1C(=O)OCC)*)C(=O)OCC,{},164.3224631 +557,*C(C(*)O)C,{$C(C)C(O)$},113.5665564 +558,*C(C*)C(CC)CC,{$CC(C(CC)CC)$},38.96888215 +559,*OC(=O)CCC(=O)OCCCCCCCCCCCCCCCCCCCC*,{},-32.76938912 +560,*C(C*)C(=O)N(CC)CC,{$CC(C(=O)N(CC)CC)$},56.77009786 +561,*NNC(=O)CCC(=O)NNC(=O)CCCCCCCCC(=O)*,{},64.69850401 +562,*NC(=O)CCC(=O)NCCCCCCCC*,{},69.22130195 +563,*NC(C(C(=O)*)(C)C)c1ccccc1,{},154.3595069 +564,*OC(=O)C/C=C/CC(=O)OCCCCCCCCCCCCCC*,{},-41.10158883 +565,*C(C*)(C(=O)OCC)CO,{$CC(C(=O)OCC)(CO)$},22.36004964 +566,*O[Si](CCCN=C1c2c(ccc(c2C(=NCCC[Si](*)(C)C)c2ccccc12)O)O)(C)C,{},51.3 +567,*Oc1ccc(cc1)S(=O)(=O)c1ccc(cc1)Oc1ccc(cc1)C(c1ccc(cc1)*)(c1ccc(cc1)O)C,{},255.52 +568,*OP(=O)(N=Nc1ccc(cc1)COC(=O)c1cc(cc(c1)C(C)(C)C)C(=O)OCc1ccc(cc1)N=NP(=O)(OCCCCCC*)OC)OC,{},62.39 +569,*c1cc2c(C(=O)N(C2=O)c2c(ccc(c2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)*)CP(=O)(OCC)OCC)cc1,{},264.06 +570,*Nc1c(cc(cc1)NC(=O)c1ccc(cc1)NC(=O)CCCCCCCCCCC(=O)Nc1ccc(cc1)C(=O)*)C(=O)OCCCCCCCCCCCCCCCC,{},187.43 +571,*C(C*)(C(=O)OCCF)C,{$CC(C(=O)OCCF)(C)$},76.42 +572,*N1C(=O)c2c(C1=O)cc(cc2)Oc1ccc(cc1)Oc1ccc(cc1)Oc1cc2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)Sc2ccc(cc2)Oc2ccc(cc2)*)cc1,{},225.33 +573,*C(C*)c1ccc(cc1)C(=O)N(C)C,{$CC(c1ccc(C(=O)N(C)C)cc1)$},146.9 +574,*OC(=O)c1ccc(cc1)C(c1ccc(cc1)C(=O)*)(C)C,{},46.39 +575,*Oc1cc2c(cc1)ccc(c2)Oc1ccc(cc1)C(=O)Nc1cc(ccc1)NC(=O)c1ccc(cc1)*,{},204.47 +576,*SCC(=O)NCCCCCCNC(=O)C*,{},29.78 +577,*c1cc2C(c3c(c2cc1)ccc(c3)c1ccc(cc1)c1c(c(cc(c1)c1ccc(cc1)OCC(CCCC)CC)c1ccc(cc1)*)c1ccc(cc1)OCC(CCCC)CC)(CCCCCC)CCCCCC,{},123.52 +578,*OC(=O)COCC(=O)OCCCC*,{},27.22 +579,*C1(CCN(CC1)C(=O)C(CC(=O)N1CCC(CC1)(CCC*)C)C)C,{},111.94 +580,*c1nc2c(nc1)cc(cc2)Oc1cc2c(nc(cn2)c2ccc(cc2)*)cc1,{},381.02 +581,*C(C*)OC(=O)c1c(cccc1)C,{$CC(OC(=O)c1ccccc1C)$},81.84 +582,*C1(CC(c2c1cc(cc2)*)(C)C)C,{},261.41 +583,*c1cc2c(C(=O)N(C2=O)c2ccc3Cc4c(c3c2)cc(cc4)N2C(=O)c3c(C2=O)cc(cc3)C(*)(C(F)(F)F)C(F)(F)F)cc1,{},442.63 
+584,*c1cc2c(C(=O)N(C2=O)c2c3c(ccc2)c(ccc3)Oc2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)Nc2nc(nc(n2)NC(=O)*)c2ccccc2)cc1,{},330.59 +585,*Oc1ccc(cc1)OC(=O)c1c(cc(cc1)C(=O)*)Sc1ccc(cc1)Cl,{},146.43 +586,*N1C(=O)c2c(C1=O)cc(cc2)Oc1cc2c(C(=O)N(C2=O)CCCCCCCCCCCC*)cc1,{},80.63 +587,*Oc1ccc(cc1)OC(=O)c1ccc(cc1)C(=O)Oc1ccc(cc1)OCCCCC*,{},89.92 +588,*C1C(=O)N(C(=O)C1C(C*)c1ccccc1)CCCCCCCCCCCC,{},70.24 +589,*OC(=O)Nc1c(ccc(c1)NC(=O)OCCCCCCCC*)C,{},117.25 +590,*C(C*)(C(=O)OCCCCCCOc1ccc(cc1)C(=O)Oc1ccc2c(c1)oc(=O)cc2)C,{$CC(C(=O)OCCCCCCOc1ccc(C(=O)Oc2ccc3ccc(=O)oc3c2)cc1)(C)$},118.96 +591,*Oc1ccc(cc1)CC(NC(=O)Cc1ccc(cc1)OC(=O)CCCCCCC(=O)*)C(=O)OCC,{},66.4 +592,*N1C(=O)c2c(C1=O)cc(cc2)Oc1cc2c(C(=O)N(C2=O)c2cc(ccc2)C(c2cc(ccc2)*)O[Si](O[Si](O[Si](C)(C)C)(C)C)(C)C)cc1,{},196.68 +593,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)C(=O)OCCOCCOCCOC(=O)*)cc1,{},79.22 +594,*Oc1ccc(cc1)C1(c2ccccc2c2ccccc12)c1ccc(cc1)Oc1c(cc(cc1)C(=O)c1ccc(cc1)C(=O)c1cc(c(cc1)*)C(F)(F)F)C(F)(F)F,{},265.8 +595,*O[Si](O[Si](O[Si](O[Si](CC[Si](O[Si](O[Si](O[Si](O[Si](CC[Si](*)(c1ccccc1)c1ccccc1)(C)C)(C)C)(C)C)(C)C)(C)C)(c1ccccc1)c1ccccc1)(c1ccccc1)c1ccccc1)(c1ccccc1)c1ccccc1)(c1ccccc1)c1ccccc1,{},11.62 +596,*N1C(=O)c2c(C1=O)c(ccc2)Oc1c(c(cc(c1)C(C)(C)C)C(C)(C)C)Oc1c2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)*)ccc1,"{N3C(=O)c4cccc(c4C3=O)Oc5c(cc(C(C)(C)C)cc5C(C)(C)C)Oc6cccc7c6C(=O)N(C7=O)>}",301.26 +597,*Oc1ccc(cc1)c1ccc(cc1)Oc1c(c(c(c(c1F)F)COC(c1cc(ccc1)C(OCc1c(c(c(c(c1F)F)*)F)F)(C(F)(F)F)C(F)(F)F)(C(F)(F)F)C(F)(F)F)F)F,{},139.73 +598,*S(=O)(=O)c1ccc(cc1)NC(=O)c1cc(cc(c1)NC(=O)c1ccc(cc1)NC(=O)C(N1C(=O)c2c(C1=O)cccc2)C)C(=O)Nc1ccc(cc1)*,{},272.65 +599,*NC(=O)NCc1ccc(cc1)CNC(=O)NCCCCCCCCCCCCCCCCCC*,{},62.4 +600,*C1CC2CC(CC(C1)O2)OC(=O)O*,{},196.52 +601,*C(C(*)C(=O)OC(C)(C)C)C(=O)OC,{$C(C(=O)OC)C(C(=O)OC(C)(C)C)$},148.77 +602,*Oc1ccc(cc1)c1ccc(cc1)C(=O)OCC(COC(=O)c1ccc(cc1)c1ccc(cc1)OC(CC*)C)C,{},116.96 +603,*N1C(=O)c2c(C1=O)cc(cc2)c1cc2c(C(=O)N(C2=O)c2c(cc(cc2C)C(c2cc(c(c(c2)C)*)C)c2c3c(ccc2)cccc3)C)cc1,{},369 
+604,*c1cc2c(nc(c(n2)c2ccccc2)c2ccc(cc2)c2c(nc3c(n2)cc(cc3)C(=O)*)c2ccccc2)cc1,{},326.27 +605,*C(C*)(C(=O)OCC)F,{$CC(C(=O)OCC)(F)$},124.69 +606,*NC(=O)CCCCC(=O)NCC(CC(CC*)(C)C)C,{},54.82 +607,*c1c2C(=O)N(C(=O)c2c(c2ccccc12)c1ccc(cc1)Oc1ccc(cc1)C(=O)c1cc(ccc1)C(=O)c1ccc(cc1)Oc1ccc(cc1)*)CCCCCCCCCCCC,{},161.91 +608,*Oc1ccc(cc1)C(c1ccc(cc1)OC(=S)*)C,{},101.06 +609,*C(C*)OCC(CC)(C)C,{$CC(OCC(C)(C)CC)$},12.2 +610,*c1sc(cc1)C(=O)Oc1ccc(cc1)[Si](c1ccc(cc1)OC(=O)*)(CC)CC,{},-36.93 +611,*Oc1c(cc(cc1C)*)C(CCCCCCCCCCCC)C,{},44.22 +612,*Oc1cc(ccc1)C(C(C(c1cc(ccc1)OC(=O)c1cc(ccc1)C(C(C(c1cc(ccc1)C(=O)*)(F)F)(F)F)(F)F)(F)F)(F)F)(F)F,{},105.91 +613,*C(C*)C(=O)Oc1ccc(cc1)C(=O)Oc1ccc(cc1)OC(=O)c1ccc(cc1)OCCCC,{$CC(C(=O)Oc1ccc(C(=O)Oc2ccc(OC(=O)c3ccc(OCCCC)cc3)cc2)cc1)$},42.05 +614,*C(C*)c1c(cccc1)C(=O)OCCC,{$CC(c1ccccc1C(=O)OCCC)$},135.21 +615,*N=P(*)(OCc1ccc(cc1)c1ccccc1)OCc1ccc(cc1)c1ccccc1,{},51.23 +616,*c1oc(nn1)c1ccc(cc1)C(=O)OCCCCCCOc1ccc(cc1)C=C1C(=O)C(=Cc2ccc(cc2)OCCCCCCOC(=O)c2ccc(cc2)*)CCC1,{},90.11 +617,*OC(=O)NCCCCCCNC(=O)OCCCCCCCCCCCC*,{},35.91 +618,*c1oc(nc1)c1cc(c(cc1)Oc1ccc(cc1)S(=O)(=O)c1ccc(cc1)Oc1c(cc(cc1)*)C(F)(F)F)C(F)(F)F,{},267.06 +619,*c1nc(nc(n1)c1ccc(cc1)Oc1ccc(cc1)C(c1ccc(cc1)Oc1ccc(cc1)*)(C(F)(F)F)C(F)(F)F)c1ccccc1,{},303.48 +620,*Oc1c(c(c(c(c1C)C)Oc1ccc(cc1)NC(=O)c1ccc(cc1)C(=O)Nc1ccc(cc1)*)C)C,{},337.62 +621,*N1C(=O)c2c(C1=O)c(ccc2)c1cc2c(C(=O)N(C2=O)c2cc(ccc2)*)cc1,"{N2C(=O)c3ccc(cc3C2=O)c4cccc5c4C(=O)N(C5=O)>}",348.28 +622,*N1C(=O)c2c(C1=O)cc(cc2)Oc1cc2c(C(=O)N(C2=O)c2cc(c(c(c2)Br)Oc2c(cc(cc2Br)*)Br)Br)cc1,{},302.26 +623,*N1C(=O)C2C3C4C(C(C2C1=O)C=C3)C(=O)N(C4=O)c1ccc(cc1)Sc1cc2c(C(=O)N(C2=O)c2ccc(cc2)CC)cc1Sc1ccc(cc1)*,{},307.1 +624,*C1C(C(C(C1)C=C*)(F)F)(C(F)(F)F)C(F)(F)F,{$C=CC1CC(C(C(F)(F)F)(C(F)(F)F)C1(F)F)$},163.71 +625,*OC(CCC(OC(=O)CCCCCCCCC(=O)*)C)C,"{C(=O)CCCCCCCCC(=O)>}",-18.21 +626,*c1ccc2c(nc([nH]2)c2cc(ccc2)c2nc3c([nH]2)ccc(c3)C(=O)Nc2cc(cc(c2)c2nc3c([nH]2)cccc3)NC(=O)*)c1,{},348.55 
+627,*C(C*)C(=O)c1ccc(cc1)CC,{$CC(C(=O)c1ccc(CC)cc1)$},68.42 +628,*C1C(=O)N(C(=O)C1*)CCOc1ccc(cc1)c1ccc(cc1)C#N,{},158.36 +629,*C1COC2C1OCC2OC(=O)CCC(=O)O*,{},86.76 +630,*OC(=O)c1ccc(cc1)C(=O)OCCCCOC(=O)CCCCC(=O)OCCCC*,{},33.64 +631,*C(C*)c1ccc(cc1)COCC(CCCC)CC,{$CC(c1ccc(COCC(CC)CCCC)cc1)$},13.66 +632,*C(C*)(C(=O)Oc1cc(c(cc1)C(=O)c1ccccc1)O)C,{$CC(C(=O)Oc1ccc(C(=O)c2ccccc2)c(O)c1)(C)$},186.45 +633,*C(C*)C(=O)OCCOC(C(F)F)(F)F,{$CC(C(=O)OCCOC(F)(F)C(F)F)$},11.16 +634,*OC(=O)OCC(C(C*)O)O,{},98.39 +635,*Oc1ccc(cc1)NC(=O)c1cc(cc(c1)C(C)(C)C)C(=O)Nc1ccc(cc1)OCCCCCC*,{},158.08 +636,*N(c1ccc(cc1)C(c1ccc(cc1)*)c1ccccc1)c1ccc(cc1)C,{},263.16 +637,*Nc1ccc(cc1)C(c1ccc(cc1)NC(=O)c1cc(cc(c1)N1C(=O)C2C(C1=O)CC=CC2)C(=O)*)(C)C,{},358.53 +638,*C(C*)(CC(=O)OCCCc1ccccc1)C(=O)OCCCc1ccccc1,{$CC(C(=O)OCCCc1ccccc1)(CC(=O)OCCCc1ccccc1)$},29.9 +639,*C=CCCCC(CCC*)Cl,{$C=CCCCC(Cl)CCC$},-9.32 +640,*C1C(=O)N(C(=O)C1C(C*)(C)C)c1c(cccc1)C,{},286.47 +641,*C1(c2c(C(=O)O1)cccc2)c1ccc(cc1)Oc1c(c(c(c(c1F)F)C(=O)c1c(c(c(c(c1F)F)Oc1ccc(cc1)*)F)F)F)F,{},276.34 +642,*Nc1cc(ccc1)NC(=O)CCCCCCC(=O)*,{},128.17 +643,*C(C*)(C(=O)OCCOc1ccc(cc1)N=Nc1ccc(cc1)C#N)C,{$CC(C(=O)OCCOc1ccc(N=Nc2ccc(C#N)cc2)cc1)(C)$},141.68 +644,*N1C(=O)c2c(C1=O)cc(cc2)Oc1ccc(cc1)C1(CCC(CC1)c1ccccc1)c1ccc(cc1)Oc1cc2c(C(=O)N(C2=O)c2ccc(cc2)Cc2ccc(cc2)*)cc1,{},249.05 +645,*Oc1ccc(cc1)Oc1ccc(cc1)C(=O)c1c(c(c(c(c1c1ccc(cc1)F)c1ccc(cc1)F)c1ccc(cc1)F)c1ccc(cc1)F)C(=O)c1ccc(cc1)*,{},252.56 +646,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)NC(=O)Nc2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)Nc2ccc(cc2)c2ccc(cc2)NC(=O)*)cc1,{},253.71 +647,*OC(C*)CCl,"{,}",2.36 +648,*NC(=O)c1cc(cc(c1)C(C)(C)C)C(=O)NCC(CCC(C*)C)C,{},157.37 +649,*c1cc2c(C(=O)N(C2=O)c2ccc(cc2)Oc2ccc(cc2)C(=O)c2cc(ccc2)C(=O)c2ccc(cc2)Oc2ccc(cc2)N2C(=O)c3c(C2=O)cc(cc3)C(=O)*)cc1,{},240.35 +650,*Oc1c(cc(cc1OC)C=Cc1ccc(cc1)C=Cc1cc(c(c(c1)OC)OCCCCCCCC*)OC)OC,{},65.39 +651,*Nc1c2c(ccc1)c(ccc2)NC(=O)c1cc(ccc1)C(=O)*,{},339.66 +652,*OS(=O)(=O)c1ccc(cc1)*,{},140.64 
+653,*c1ccc2ccc3c(c2n1)nc(cc3)c1ccc(cc1)c1c(cc(c(c1)CCCCCC)c1ccc(cc1)*)CCCCCC,{},160.44 +654,*C(C*)(C(=O)OCCCCCCCCCCn1c2ccc(cc2c2ccccc12)N=Nc1ccc(cc1)[N+](=O)[O-])C,{$CC(C(=O)OCCCCCCCCCCn1c2ccccc2c2cc(N=Nc3ccc([N+](=O)[O-])cc3)ccc21)(C)$},60.81 +655,*N=Nc1ccc(cc1)NC(=O)CCC(=O)Nc1ccc(cc1)*,{},208.85 +656,*C(C*)c1cc(ccc1)Cl,{$CC(c1cccc(Cl)c1)$},122.57 +657,*Oc1ccc(cc1)C1(c2cc(ccc2c2ccc(cc12)[N+](=O)[O-])OC)c1ccc(cc1)OC(=O)CCCC(=O)*,{},195.71 +658,*c1cc2c(C(=O)N(C2=O)c2c(cc(c(c2C)C(=O)c2cc(ccc2)N2C(=O)c3c(C2=O)cc(cc3)C(*)(C(F)(F)F)C(F)(F)F)C)C)cc1,{},280.09 +659,*Oc1ccc(cc1)Oc1ccc(cc1)C(=O)c1cc(ccc1)NC(=O)c1cc(ccc1)C(=O)Nc1cc(ccc1)C(=O)c1ccc(cc1)*,{},222.22 +660,*Oc1ccc(cc1)CC(NC(=O)CCc1ccc(cc1)OC(=O)CCCC(=O)*)C(=O)OCCOCCOCC,{},42.34 +661,*OP(=O)(OCCCCCCCCCCOc1ccc(cc1)C=Cc1ccc(cc1)OCCCCCCCCCC*)OCCCCCCCCCCOc1ccc(cc1)N=Nc1ccc(cc1)C#N,{},51.61 diff --git a/test/bigsmiles.smi b/test/bigsmiles.smi new file mode 100644 index 0000000..dd5de8b --- /dev/null +++ b/test/bigsmiles.smi @@ -0,0 +1,62 @@ +# Example BigSMILES String pulled from https://olsenlabmit.github.io/BigSMILES/docs/line_notation.html#the-bigsmiles-line-notation +[$]-CC-[$] +[$]CC[$] +[$]CC(CC)[$] +C([$])C([$])CC +[<]C(=O)CCCCC(=O)[<] +[>]NCCCCCCN[>] +[<]C(=O)CCCCC(=O)NCCCCCCN[>] +[<]CCO[>] +[>]CCO[<] +[$1]-CC-[$2] +[$1]CC[$2] +C([$1])C[$2] +[$]-CC-[$] +[$]-CC([$])-[$] +[$]=CCC=[$] +{[][$]CC[$],[$]CC(CC)[$][]} +{[][<]C(=O)CCCCC(=O)[<],[>]NCCCCCCN[>][]} +{[][<]C(=O)CCCCC(=O)NCCCCCCN[>][]} +{[][$]CC[$],[$]CC(CC)[$][]} +C{[$][$]CC[$],[$]CC(CC)[$][$]} +[H]O{[>][<]C(=O)CCCCC(=O)[<],[>]NCCCCCCN[>][<]}[H] +{[][<]C(=O)CCCCC(=O)[<],[>]NCCCCCCN[>];[>]O[H],[<][H][]} +[H]O{[>][<]C(=O)CCCCC(=O)[<],[>]NCCCCCCN[>];[<][H][]} +{[][<]c1cc([>])cc([>]c1);[<]Br,[>]B(O)O[]} +OB(O){[>][<]c1cc([>])cc([>]c1);[<]Br[]} +{[][$]CC(C#N)[$],[$]CC(c1ccccc1)[$][]} +{[][<]OCC[>][<]}{[>][<]OC(C)C[>][]} +{[][<]C(=O)c1ccc(cc1)C(=O)[<],[>]OCCO[>][]} +{[][$]CC[$],[$]CC([$])[$][]} 
+{[][$]CC(C)(C)[$],[$]CC(c1ccc(cc1)C{[$][$]CC(C)(C(=O)OC)[$][$]}Br)[$][]} +{[][>]C(=O)Nc1ccc(C)c(c1)NC(=O)[>],[<]OCC{[<][>]OCC[<][>]}O[<],[<]OCCCO[<][]} +{[][$]CC[$],[$]CC(CC)[$][]} +{[]CC,CC(CC)[]} +O{[>][<]C(=O)C(C)N[>],[<]C(=O)CN[>][<]} +O{[>]C(=O)C(C)N,C(=O)CN[<]} +{[<][<]C(=O)C(C)N[>],[<]C(=O)CN[>][>]}O +{[<][>]NC(C)C(=O)[<],[>]NCC(=O)[<][>]}O +{[<]NC(C)C(=O),NCC(=O)[>]}O +{[][$]CC(C)([#R])[$][]} +{[][$]CC(C)([#R])[$][]}.{#R=C(=O)OCC12CC(C3)CC(C1)CC3C2} +C([#Arm])([#Arm])([#Arm])[#Arm] +C([#Arm])([#Arm])([#Arm])[#Arm].{#Arm=CO{[<][>]CCO[<][>]}} +A([<1[Inner]1])R(A[<1[Inner]1])(B[>1[Inner]2])B[>1[Inner]2].{#A=C}.{#R=C}.{#B=C}.{#Inner=<} +A([<1[<1]1])R(A[<1[<1]1])(B[>1[>1]2])B[>1[>1]2].{#A=C}.{#R=C}.{#B=C} +A([$1[Inner]1])R(A'[$1[Inner]1])(A[$1[Inner]2])A'[$1[Inner]2].{#A=C}.{#A'=C}.{#R=C}.{#Inner=$} +A([$1[$1]1])R(A'[$1[$1]1])(A[$1[$1]2])A'[$1[$1]2].{#A=C}.{#A'=C}.{#R=C} +A([$1[$1]1])R(A'[$1[$2]1])(A[$1[$1]2])A'[$1[$2]2].{#A=C}.{#A'=C}.{#R=C} +A([$1[<1]1])R(A'[$1[>1]1])(A[$1[<1]2])A'[$1[>1]2].{#A=C}.{#A'=C}.{#R=C} +CC{[>][<]CC(C)[>][<]}CC(C)=C +C{[>][<]C[C@@H](C)[>][<]}CC(C)=C +CC{[>][<]C[C@@H](C)C[C@H](C)[>];[<]C=CC,[<]C[C@H](C)C=CC[]} +{[][$]CC(c1ccncc1)[$],[$]CC(c1cc[n+](C)cc1)[$].[I-][]} +{[][<][#A][#R][#A][<],[>][#B][#R']([#B][>])([#B][>])[#B][>][]} +{[][<][#A][#R][#A][<],[>][#B][#R']([#B][>])([#B][>])[#B][>][]}.{#A=C}.{#R=C}.{#B=C}.{#R'=C} +{[][<][#A][#R][#A][<],[>][#B][#R']([#B][>])([#B][>])[#B][>];[>][#E1],[<][#E2][]} +{[][<][#A][#R][#A][<],[>][#B][#R']([#B][>])([#B][>])[#B][>];[>][#E1],[<][#E2][]}.{#A=C}.{#R=C}.{#B=C}.{#R'=C}.{#E1=C}.{#E2=C} +{[][>]COC(=O){[$][$]COC[$][$]}C(=O)OC[>],c1([<])cc([#L]2)cc([#L]3)c1.c4([<])cc([#L]5)cc([#L]6)c4.c7([<])cc([#L]8)cc([#L]9)c7.C%10([<])cc([#L]%11)cc([#L]%12)c%10.[Pd++]258%11.[Pd++]369%12} +{[][>]COC(=O){[$][$]COC[$][$]}C(=O)OC[>],c1([<])cc([#L]2)cc([#L]3)c1.c4([<])cc([#L]5)cc([#L]6)c4.c7([<])cc([#L]8)cc([#L]9)c7.C%10([<])cc([#L]%11)cc([#L]%12)c%10.[Pd++]258%11.[Pd++]369%12}.{#L=c(c1)cccn1} 
+S1C(c2ccccc2){[$][$]CC(c1ccccc1)[$][$]}C(=O)OC(=C3)N=NN3CC(O)COC(=O)C(C)C1 +C1CCC{[$1][$1]=CCCCCCCC=[$1][$1]}CCCC1 +{[][$1]=CCCCCCCC=[$1][]} diff --git a/test/test_tokenize_bigsmiles.py b/test/test_tokenize_bigsmiles.py new file mode 100644 index 0000000..f455aa8 --- /dev/null +++ b/test/test_tokenize_bigsmiles.py @@ -0,0 +1,283 @@ +import csv +import json +from pathlib import Path +from tempfile import NamedTemporaryFile + +import pytest +import smirk +from smirk.smirk import SmirkTokenizer + + +def _assert_pretokenize( + tokenizer: SmirkTokenizer, text: str, expected_tokens: list[str] +) -> None: + assert tokenizer.pretokenize(text) == expected_tokens + + +def _tokens(spec: str) -> list[str]: + return spec.split() + + +def _smi_fixture(filename: str) -> list[tuple[str, str]]: + path = Path(__file__).with_name(filename) + return [ + (f"{filename}:{idx}", line) + for idx, line in enumerate(path.read_text().splitlines(), start=1) + if line and not line.startswith("#") + ] + + +def _bigsmiles_fixture() -> list[tuple[str, str]]: + return _smi_fixture("bigsmiles.smi") + + +def _bigsmiles_csv_fixture() -> list[tuple[str, str]]: + path = Path(__file__).with_name("bigsmiles.csv") + data_lines = [ + (idx, line) + for idx, line in enumerate(path.read_text().splitlines(), start=1) + if line and not line.startswith("#") + ] + reader = csv.reader(line for _, line in data_lines) + header = next(reader) + bigsmiles_index = header.index("BigSMILES") + return [ + (f"bigsmiles.csv:{line_no}", row[bigsmiles_index]) + for (line_no, _), row in zip(data_lines[1:], reader) + if row[bigsmiles_index] + ] + + +INLINE_ROUNDTRIP_BIGSMILES = [ + "{[$]CC[$]}", + "{[$]CC[$],[$]C(C)C[$]}", + "{[<]CC[>]}", + "[$1]", + "[<2]", + "[]", + "{[]CC[$]}", + "{[$]CC[$];C[$],[$]C}", + "CC{[$]CC[$]}CC", + "{[$]CC(c1ccccc1)[$]}", + "{[>]CCCCCC(=O)[<],[>]NCCCCCCN[<]}", + "{[$]CC[$]}{[$]CC(C)[$]}", +] + + +def _roundtrip_fixtures() -> list[tuple[str, list[tuple[str, str]]]]: + return [ + ( + "inline", + [ + 
(f"inline:{idx}", text) + for idx, text in enumerate(INLINE_ROUNDTRIP_BIGSMILES, start=1) + ], + ), + ("bigsmiles.smi", _bigsmiles_fixture()), + ("opensmiles.smi", _smi_fixture("opensmiles.smi")), + ("bigsmiles.csv", _bigsmiles_csv_fixture()), + ] + + +UNDEFINED_FRAGMENT_PLACEHOLDER_CASES = [ + ("{[][$]CC(C)([#R])[$][]}", ["#R"]), + ("C([#Arm])([#Arm])([#Arm])[#Arm]", ["#Arm", "#Arm", "#Arm", "#Arm"]), + ( + "{[][<][#A][#R][#A][<],[>][#B][#R']([#B][>])([#B][>])[#B][>][]}", + ["#A", "#R", "#A", "#B", "#R'", "#B", "#B", "#B"], + ), + ( + "{[][<][#A][#R][#A][<],[>][#B][#R']([#B][>])([#B][>])[#B][>];" + "[>][#E1],[<][#E2][]}", + ["#A", "#R", "#A", "#B", "#R'", "#B", "#B", "#B", "#E1", "#E2"], + ), + ( + "{[][>]COC(=O){[$][$]COC[$][$]}C(=O)OC[>],c1([<])cc([#L]2)cc([#L]3)c1." + "c4([<])cc([#L]5)cc([#L]6)c4.c7([<])cc([#L]8)cc([#L]9)c7." + "C%10([<])cc([#L]%11)cc([#L]%12)c%10.[Pd++]258%11.[Pd++]369%12}", + ["#L", "#L", "#L", "#L", "#L", "#L", "#L", "#L"], + ), +] + +BARE_LABEL_BIGSMILES_CASES = [ + ( + "A([<1[Inner]1])R(A[<1[Inner]1])(B[>1[Inner]2])B[>1[Inner]2]", + 7, + ), + ( + "A([<1[<1]1])R(A[<1[<1]1])(B[>1[>1]2])B[>1[>1]2]", + 3, + ), + ( + "A([$1[Inner]1])R(A'[$1[Inner]1])(A[$1[Inner]2])A'[$1[Inner]2]", + 9, + ), +] +BARE_LABEL_BIGSMILES_WITH_DEFINITIONS = [ + ( + "A([<1[Inner]1])R(A[<1[Inner]1])(B[>1[Inner]2])B[>1[Inner]2]." + "{#A=C}.{#R=C}.{#B=C}.{#Inner=<}" + ), + "A([<1[<1]1])R(A[<1[<1]1])(B[>1[>1]2])B[>1[>1]2].{#A=C}.{#R=C}.{#B=C}", + ( + "A([$1[Inner]1])R(A'[$1[Inner]1])(A[$1[Inner]2])A'[$1[Inner]2]." 
+ "{#A=C}.{#A'=C}.{#R=C}.{#Inner=$}" + ), + "A([$1[$1]1])R(A'[$1[$1]1])(A[$1[$1]2])A'[$1[$1]2].{#A=C}.{#A'=C}.{#R=C}", + "A([$1[$1]1])R(A'[$1[$2]1])(A[$1[$1]2])A'[$1[$2]2].{#A=C}.{#A'=C}.{#R=C}", + "A([$1[<1]1])R(A'[$1[>1]1])(A[$1[<1]2])A'[$1[>1]2].{#A=C}.{#A'=C}.{#R=C}", +] +EXPECTED_UNKNOWN_BIGSMILES_TEXTS = ( + {text for text, _ in UNDEFINED_FRAGMENT_PLACEHOLDER_CASES} + | {text for text, _ in BARE_LABEL_BIGSMILES_CASES} + | set(BARE_LABEL_BIGSMILES_WITH_DEFINITIONS) +) +NON_EXACT_ROUNDTRIP_TEXTS = EXPECTED_UNKNOWN_BIGSMILES_TEXTS | {"[Cu++]"} + + +def _is_lossless_roundtrip_text(text: str) -> bool: + return ".{#" not in text and text not in NON_EXACT_ROUNDTRIP_TEXTS + + +@pytest.fixture +def bigsmiles_tokenizer() -> SmirkTokenizer: + return SmirkTokenizer(bigsmiles=True) + + +@pytest.fixture +def smiles_tokenizer() -> SmirkTokenizer: + return SmirkTokenizer(bigsmiles=False) + + +ROUNDTRIP_FIXTURES = _roundtrip_fixtures() + + +@pytest.mark.parametrize( + ("fixture_name", "fixture_rows"), + ROUNDTRIP_FIXTURES, + ids=[name for name, _ in ROUNDTRIP_FIXTURES], +) +def test_bigsmiles_roundtrip_batch_decode(fixture_name, fixture_rows): + bigsmirk = smirk.SmirkBigSmilesFast() + bigsmiles_batch = [text for _, text in fixture_rows] + encoded = bigsmirk(bigsmiles_batch, add_special_tokens=False) + decoded = bigsmirk.batch_decode(encoded["input_ids"], skip_special_tokens=True) + itemwise_decoded = [ + bigsmirk.decode(ids, skip_special_tokens=True) for ids in encoded["input_ids"] + ] + exact_failures = [ + f"{source}: expected {text!r}, got {decoded_text!r}" + for (source, text), decoded_text in zip(fixture_rows, decoded) + if _is_lossless_roundtrip_text(text) and decoded_text != text + ] + + assert decoded == itemwise_decoded + assert len(decoded) == len(bigsmiles_batch) + assert not exact_failures, ( + f"{fixture_name} exact roundtrip mismatches:\n" + "\n".join(exact_failures) + ) + + +def test_bigsmiles_fixture_has_no_unknown_tokens(): + bigsmirk = 
smirk.SmirkBigSmilesFast() + failures = [] + + for line_no, text in _bigsmiles_fixture(): + if text in EXPECTED_UNKNOWN_BIGSMILES_TEXTS: + continue + + tokens = bigsmirk.tokenize(text, add_special_tokens=False) + if bigsmirk.unk_token in tokens: + failures.append(f"line {line_no}: {text}") + + assert not failures, "unknown tokens in BigSMILES fixtures:\n" + "\n".join(failures) + + +def _unknown_spans(bigsmirk: smirk.SmirkBigSmilesFast, text: str) -> list[str]: + encoding = bigsmirk._tokenizer.encode(text, add_special_tokens=False) + unk_token_id = bigsmirk._tokenizer.token_to_id(bigsmirk.unk_token) + return [ + text[start:end] + for token_id, (start, end) in zip(encoding["input_ids"], encoding["offsets"]) + if token_id == unk_token_id + ] + + +@pytest.mark.parametrize( + ("text", "unknown_spans"), + UNDEFINED_FRAGMENT_PLACEHOLDER_CASES, +) +def test_bigsmiles_undefined_fragment_placeholders_return_unknowns(text, unknown_spans): + bigsmirk = smirk.SmirkBigSmilesFast() + tokens = bigsmirk.tokenize(text, add_special_tokens=False) + actual_unknown_spans = _unknown_spans(bigsmirk, text) + + assert actual_unknown_spans == unknown_spans + assert tokens.count(bigsmirk.unk_token) == len(actual_unknown_spans) + + +@pytest.mark.parametrize( + ("text", "unknown_count"), + BARE_LABEL_BIGSMILES_CASES, +) +def test_bigsmiles_bare_labels_return_unknowns(text, unknown_count): + bigsmirk = smirk.SmirkBigSmilesFast() + tokens = bigsmirk.tokenize(text, add_special_tokens=False) + + assert tokens.count(bigsmirk.unk_token) == unknown_count + + +@pytest.mark.parametrize( + ("text", "expected_tokens"), + [ + ("OC[C@@H]", _tokens("O C [ C @@ H ]")), + ("C[C@H](N)C(=O)O", _tokens("C [ C @ H ] ( N ) C ( = O ) O")), + ], +) +def test_smiles_tokens_match_between_modes( + bigsmiles_tokenizer, smiles_tokenizer, text, expected_tokens +): + _assert_pretokenize(bigsmiles_tokenizer, text, expected_tokens) + _assert_pretokenize(smiles_tokenizer, text, expected_tokens) + + +@pytest.mark.parametrize( + 
"bigsmiles,expected_type", + [ + (True, "BigSmirkPreTokenizer"), + (False, None), + ], +) +def test_tokenizer_serialize_pretokenizer_type(bigsmiles, expected_type): + tokenizer = SmirkTokenizer(bigsmiles=bigsmiles) + config = json.loads(tokenizer.to_str()) + assert "pre_tokenizer" in config + + if expected_type is None: + assert "type" not in config["pre_tokenizer"] + assert "bigsmiles_version" not in config["pre_tokenizer"] + else: + assert config["pre_tokenizer"].get("type") == expected_type + assert config["pre_tokenizer"].get("bigsmiles_version") == "1.1" + + +@pytest.mark.parametrize( + "text", + [ + "{[$]CC[$]}", + "{[<]CC[>]}", + "{[]CC[$]}", + "{[$]CC[$],[$]C(C)C[$]}", + ], +) +def test_bigsmiles_tokenizer_save_load(bigsmiles_tokenizer, text): + with NamedTemporaryFile("w", suffix=".json", delete=False) as file: + bigsmiles_tokenizer.save(file.name) + with open(file.name) as saved: + config = json.load(saved) + loaded = SmirkTokenizer.from_file(file.name) + + assert config["pre_tokenizer"].get("bigsmiles_version") == "1.1" + original_splits = bigsmiles_tokenizer.pretokenize(text) + loaded_splits = loaded.pretokenize(text) + assert original_splits == loaded_splits