From 0f7e082f9a3b14bbabe6a93c6cdb2587b0ba8d16 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 26 Sep 2025 09:44:59 +0000 Subject: [PATCH 1/9] Initial plan From 52e8b5bdb4923886875adacc5753f2362003c911 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 26 Sep 2025 09:55:08 +0000 Subject: [PATCH 2/9] Add missing dependencies to pyproject.toml and fix megatron import issues Co-authored-by: aflah02 <72096386+aflah02@users.noreply.github.com> --- pyproject.toml | 14 ++++++++++++-- tokensmith/utils.py | 15 +++++++++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7793da6..ec365df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,14 @@ repository = "https://github.com/aflah02/tokensmith" keywords = ["dataset", "management", "editing", "sampling", "exporting", "searching"] [tool.poetry.dependencies] -python = "^3.7" -# Add other dependencies here +python = "^3.8" +# Core dependencies +numpy = "^1.21.0" +pandas = "^1.3.0" +tqdm = "^4.62.0" +transformers = "^4.20.0" +torch = "^1.11.0" +tokengrams = "^0.3.0" [tool.poetry.group.docs.dependencies] mkdocs = "^1.5.0" @@ -19,6 +25,10 @@ mkdocs-material = "^9.0.0" mkdocstrings = {extras = ["python"], version = "^0.24.0"} mkdocstrings-python = "^1.7.0" +[tool.poetry.group.ui.dependencies] +streamlit = "^1.20.0" +altair = "^4.2.0" + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" \ No newline at end of file diff --git a/tokensmith/utils.py b/tokensmith/utils.py index 6ac807c..bda3126 100644 --- a/tokensmith/utils.py +++ b/tokensmith/utils.py @@ -6,14 +6,22 @@ from tqdm import trange import uuid from typing import Optional, List, Dict, Any -from megatron.data.indexed_dataset import MMapIndexedDataset -from .megatron_dependencies import get_train_valid_test_split_, build_index_mappings from transformers import AutoTokenizer import torch import os from functools import lru_cache import time +# Optional megatron imports - will be imported when needed +try: + from megatron.data.indexed_dataset import MMapIndexedDataset + MEGATRON_AVAILABLE = True +except ImportError: + MMapIndexedDataset = None + MEGATRON_AVAILABLE = False + +from .megatron_dependencies import get_train_valid_test_split_, build_index_mappings + logger = logging.getLogger(__name__) @lru_cache(1) @@ -61,6 +69,9 @@ def __init__(self, packing_impl: str, allow_chopped: bool, add_extra_token_to_seq: int): + if not MEGATRON_AVAILABLE: + raise ImportError("Megatron is required for WriteableMMapIndexedDataset functionality. Please install GPT-NeoX following the instructions in the README.") + logger.debug(f"Initializing WriteableMMapIndexedDataset with pointer: {dataset_prefix}.bin and index: {dataset_prefix}.idx") self.corpus_pointer = open(f"{dataset_prefix}.bin", 'r+b') From 6d709e57d9f4905720f780018de8d3d520a29d42 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 26 Sep 2025 09:57:44 +0000 Subject: [PATCH 3/9] Update setup.py dependencies and add comprehensive test validation Co-authored-by: aflah02 <72096386+aflah02@users.noreply.github.com> --- setup.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 59376e4..636c351 100644 --- a/setup.py +++ b/setup.py @@ -15,8 +15,25 @@ 'License :: OSI Approved :: MIT License', # Replace with your actual license 'Operating System :: OS Independent', ], - python_requires='>=3.6', + python_requires='>=3.8', install_requires=[ - # List your package dependencies here + 'numpy>=1.21.0', + 'pandas>=1.3.0', + 'tqdm>=4.62.0', + 'transformers>=4.20.0', + 'torch>=1.11.0', + 'tokengrams>=0.3.0', ], + extras_require={ + 'ui': [ + 'streamlit>=1.20.0', + 'altair>=4.2.0', + ], + 'docs': [ + 'mkdocs>=1.5.0', + 'mkdocs-material>=9.0.0', + 'mkdocstrings[python]>=0.24.0', + 'mkdocstrings-python>=1.7.0', + ], + }, ) \ No newline at end of file From 06792cb810ef35f45ce4059f4d4a5c8e41827a0e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 26 Sep 2025 10:13:57 +0000 Subject: [PATCH 4/9] Remove torch and make tokengrams optional with separate install groups Co-authored-by: aflah02 <72096386+aflah02@users.noreply.github.com> --- README.md | 166 +++++++++++++++++++++++++++++++---- pyproject.toml | 11 ++- setup.py | 10 ++- tokensmith/search/handler.py | 11 ++- tokensmith/utils.py | 1 - 5 files changed, 174 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index d946694..03ea428 100644 --- a/README.md +++ b/README.md @@ -38,9 +38,11 @@ DatasetManager ### Installation -Ensure you have a working GPT-NeoX environment using steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies) +TokenSmith can be installed with different dependency sets depending on your use case: -Within the same env run the following - +#### Option 1: Core Functionality Only (Recommended) + +For basic TokenSmith functionality including inspect, sample, export, and edit operations: ```bash git clone https://github.com/aflah02/tokensmith.git @@ -48,8 +50,101 @@ cd tokensmith pip install -e . ``` +This installs core dependencies: +- `numpy` - Array operations +- `pandas` - Data processing +- `tqdm` - Progress bars +- `transformers` - Tokenizer support + +#### Option 2: With Search Functionality + +For search and indexing operations using tokengrams: + +```bash +git clone https://github.com/aflah02/tokensmith.git +cd tokensmith +pip install -e ".[search]" +``` + +Or with Poetry: +```bash +poetry install --with search +``` + +#### Option 3: With UI Support + +For the interactive Streamlit web interface: + +```bash +git clone https://github.com/aflah02/tokensmith.git +cd tokensmith +pip install -e ".[ui]" +``` + +Or with Poetry: +```bash +poetry install --with ui +``` + +#### Option 4: With Documentation Tools + +For building documentation: + +```bash +pip install -e ".[docs]" +``` + +Or with Poetry: +```bash +poetry install --with docs +``` + +#### Option 5: Complete Installation (All Features) + +For all optional dependencies (search, UI, and docs): + +```bash +pip install -e ".[all]" +``` + +Or with Poetry: +```bash +poetry install --with all +``` + +You can also combine multiple options: +```bash +pip install -e ".[search,ui]" # Search + UI +``` + +#### GPT-NeoX/Megatron Integration + +**Note:** For functionality that requires GPT-NeoX/Megatron (such as `WriteableMMapIndexedDataset` and some advanced dataset operations), you must separately install GPT-NeoX following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies). + +TokenSmith is designed to work with or without GPT-NeoX: +- **Without GPT-NeoX**: Core functionality, search, UI, and most operations work perfectly +- **With GPT-NeoX**: Full dataset editing and advanced Megatron-compatible operations are available + +**Note:** Torch is provided by the GPT-NeoX environment and is not included as a TokenSmith dependency to avoid version conflicts. + +#### Python Version Requirements + +- **Python 3.8+** is required +- Compatible with modern Python versions and dependency ecosystems + +#### Which Installation Option to Choose? + +- **Core functionality**: Use Option 1 if you want basic dataset operations without search or UI +- **Search features**: Use Option 2 if you need token sequence search and indexing +- **Web interface**: Use Option 3 if you want the interactive Streamlit UI for visual dataset exploration +- **Documentation**: Use Option 4 if you're contributing to documentation or want to build docs locally +- **Complete features**: Use Option 5 if you want all functionality available +- **Development**: Use Option 5 if you're developing TokenSmith or want all features + ### Basic Usage +#### Core Functionality (Works with basic installation) + ```python from tokensmith import DatasetManager from transformers import AutoTokenizer @@ -57,26 +152,59 @@ from transformers import AutoTokenizer # Initialize the manager manager = DatasetManager() +# Load a tokenizer for detokenization +tokenizer = AutoTokenizer.from_pretrained("gpt2") +``` + +#### Search Functionality (Requires search installation) + +```python +# Setup search functionality - requires tokengrams +try: + manager.setup_search( + bin_file_path="path/to/dataset.bin", + search_index_save_path="path/to/search_index", + vocab=2**16, # or 2**32 for larger vocabularies + reuse=True + ) + + # Search operations + query = [101, 2023, 102] # Token IDs + count = manager.search.count(query) + positions = manager.search.positions(query) + print("✅ Search functionality available") + +except ImportError as e: + print("ℹ️ Search functionality requires tokengrams: pip install 'tokensmith[search]'") +``` + +#### Advanced Operations (Requires GPT-NeoX environment) + +```python # Setup dataset for inspection, sampling, editing, and export -manager.setup_edit_inspect_sample_export( - dataset_prefix="path/to/your/dataset", - batch_info_save_prefix="path/to/batch_info", - train_iters=1000, - train_batch_size=32, - train_seq_len=1024, - seed=42 -) +# Note: This requires GPT-NeoX/Megatron to be installed +try: + manager.setup_edit_inspect_sample_export( + dataset_prefix="path/to/your/dataset", + batch_info_save_prefix="path/to/batch_info", + train_iters=1000, + train_batch_size=32, + train_seq_len=1024, + seed=42 + ) + print("✅ Full functionality available") +except ImportError as e: + print("ℹ️ Advanced operations require GPT-NeoX installation") +``` -# Setup search functionality (optional) -manager.setup_search( - bin_file_path="path/to/dataset.bin", - search_index_save_path="path/to/search_index", - vocab=2**16, # or 2**32 for larger vocabularies - reuse=True -) +#### Web UI Usage (Requires UI installation) -# Load a tokenizer for detokenization -tokenizer = AutoTokenizer.from_pretrained("gpt2") +```bash +# Navigate to UI directory and run +cd tokensmith/ui +./run_ui.sh + +# Or modify run_ui.sh for your specific setup ``` ## 📚 Core Functionality diff --git a/pyproject.toml b/pyproject.toml index ec365df..fa27f9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,6 @@ numpy = "^1.21.0" pandas = "^1.3.0" tqdm = "^4.62.0" transformers = "^4.20.0" -torch = "^1.11.0" -tokengrams = "^0.3.0" [tool.poetry.group.docs.dependencies] mkdocs = "^1.5.0" @@ -29,6 +27,15 @@ mkdocstrings-python = "^1.7.0" streamlit = "^1.20.0" altair = "^4.2.0" +[tool.poetry.group.search.dependencies] +tokengrams = "^0.3.0" + +[tool.poetry.group.all.dependencies] +# All optional dependencies for complete functionality +streamlit = "^1.20.0" +altair = "^4.2.0" +tokengrams = "^0.3.0" + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" \ No newline at end of file diff --git a/setup.py b/setup.py index 636c351..db19bd3 100644 --- a/setup.py +++ b/setup.py @@ -21,19 +21,25 @@ 'pandas>=1.3.0', 'tqdm>=4.62.0', 'transformers>=4.20.0', - 'torch>=1.11.0', - 'tokengrams>=0.3.0', ], extras_require={ 'ui': [ 'streamlit>=1.20.0', 'altair>=4.2.0', ], + 'search': [ + 'tokengrams>=0.3.0', + ], 'docs': [ 'mkdocs>=1.5.0', 'mkdocs-material>=9.0.0', 'mkdocstrings[python]>=0.24.0', 'mkdocstrings-python>=1.7.0', ], + 'all': [ + 'streamlit>=1.20.0', + 'altair>=4.2.0', + 'tokengrams>=0.3.0', + ], }, ) \ No newline at end of file diff --git a/tokensmith/search/handler.py b/tokensmith/search/handler.py index bc14e20..c637dc8 100644 --- a/tokensmith/search/handler.py +++ b/tokensmith/search/handler.py @@ -1,12 +1,21 @@ # Heavily inspired by the original code from https://github.com/EleutherAI/tokengrams/blob/master/tokengrams/tokengrams.pyi and uses the same library. -from tokengrams import MemmapIndex from typing import List import os import logging +# Optional tokengrams import - will be imported when needed +try: + from tokengrams import MemmapIndex + TOKENGRAMS_AVAILABLE = True +except ImportError: + MemmapIndex = None + TOKENGRAMS_AVAILABLE = False + class SearchHandler: def __init__(self, bin_file_path: str, index_save_path: str, vocab: int, verbose: bool = True, reuse: bool = True): + if not TOKENGRAMS_AVAILABLE: + raise ImportError("Tokengrams is required for search functionality. Please install with: pip install 'tokensmith[search]' or pip install tokengrams") self.bin_file_path = bin_file_path self.index_save_path = index_save_path diff --git a/tokensmith/utils.py b/tokensmith/utils.py index bda3126..37cc823 100644 --- a/tokensmith/utils.py +++ b/tokensmith/utils.py @@ -7,7 +7,6 @@ import uuid from typing import Optional, List, Dict, Any from transformers import AutoTokenizer -import torch import os from functools import lru_cache import time From a552ce677a93f233069876557bd4cbe4826895b9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 26 Sep 2025 10:20:45 +0000 Subject: [PATCH 5/9] Remove transformers dependency to avoid NeoX version conflicts Co-authored-by: aflah02 <72096386+aflah02@users.noreply.github.com> --- README.md | 5 +++-- pyproject.toml | 1 - setup.py | 1 - tokensmith/ui/app.py | 2 ++ tokensmith/utils.py | 26 ++++++++++++++++++++++++-- 5 files changed, 29 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 03ea428..3b8b2f6 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,8 @@ This installs core dependencies: - `numpy` - Array operations - `pandas` - Data processing - `tqdm` - Progress bars -- `transformers` - Tokenizer support + +**Note:** `transformers` and `torch` are provided by the GPT-NeoX environment when needed for advanced functionality. #### Option 2: With Search Functionality @@ -125,7 +126,7 @@ TokenSmith is designed to work with or without GPT-NeoX: - **Without GPT-NeoX**: Core functionality, search, UI, and most operations work perfectly - **With GPT-NeoX**: Full dataset editing and advanced Megatron-compatible operations are available -**Note:** Torch is provided by the GPT-NeoX environment and is not included as a TokenSmith dependency to avoid version conflicts. +**Note:** `torch` and `transformers` are provided by the GPT-NeoX environment and are not included as TokenSmith dependencies to avoid version conflicts. #### Python Version Requirements diff --git a/pyproject.toml b/pyproject.toml index fa27f9a..07202b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,6 @@ python = "^3.8" numpy = "^1.21.0" pandas = "^1.3.0" tqdm = "^4.62.0" -transformers = "^4.20.0" [tool.poetry.group.docs.dependencies] mkdocs = "^1.5.0" diff --git a/setup.py b/setup.py index db19bd3..4ddeb82 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,6 @@ 'numpy>=1.21.0', 'pandas>=1.3.0', 'tqdm>=4.62.0', - 'transformers>=4.20.0', ], extras_require={ 'ui': [ diff --git a/tokensmith/ui/app.py b/tokensmith/ui/app.py index 71e0193..3721b89 100644 --- a/tokensmith/ui/app.py +++ b/tokensmith/ui/app.py @@ -62,6 +62,8 @@ class DefaultArgs: from transformers import AutoTokenizer print(f"Loading tokenizer from {st.session_state.args.tokenizer_path}") st.session_state.tokenizer = AutoTokenizer.from_pretrained(st.session_state.args.tokenizer_path) + except ImportError: + st.error("Transformers library not available. Tokenizer functionality requires transformers to be installed (should be available in GPT-NeoX environment).") except Exception as e: st.error(f"Failed to load tokenizer: {e}") diff --git a/tokensmith/utils.py b/tokensmith/utils.py index 37cc823..59e7729 100644 --- a/tokensmith/utils.py +++ b/tokensmith/utils.py @@ -6,11 +6,18 @@ from tqdm import trange import uuid from typing import Optional, List, Dict, Any -from transformers import AutoTokenizer import os from functools import lru_cache import time +# Optional transformers imports - will be imported when needed +try: + from transformers import AutoTokenizer + TRANSFORMERS_AVAILABLE = True +except ImportError: + AutoTokenizer = None + TRANSFORMERS_AVAILABLE = False + # Optional megatron imports - will be imported when needed try: from megatron.data.indexed_dataset import MMapIndexedDataset @@ -28,7 +35,22 @@ def warn_once(logger: logging.Logger, msg: str): logger.warning(msg) time.sleep(10) -def generate_training_sample(tokenized_segments: List[List[int]], tokenizer: AutoTokenizer) -> str: +def generate_training_sample(tokenized_segments: List[List[int]], tokenizer) -> str: + """Generate training sample from tokenized segments using a tokenizer. + + Args: + tokenized_segments: List of tokenized segments + tokenizer: Tokenizer object (should have a decode method) + + Returns: + Decoded text string + + Raises: + ImportError: If transformers is not available and tokenizer is None + """ + if not TRANSFORMERS_AVAILABLE and tokenizer is None: + raise ImportError("Transformers is required for tokenization functionality. It should be available from your GPT-NeoX environment.") + concat_training_sample = np.concatenate(tokenized_segments) return tokenizer.decode( concat_training_sample, From 5b14cb98b8c1c5c8f78f492646904254d643b471 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 26 Sep 2025 10:25:56 +0000 Subject: [PATCH 6/9] Correct documentation - only search works without GPT-NeoX environment Co-authored-by: aflah02 <72096386+aflah02@users.noreply.github.com> --- README.md | 66 ++++++++++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 3b8b2f6..1975df3 100644 --- a/README.md +++ b/README.md @@ -40,9 +40,9 @@ DatasetManager TokenSmith can be installed with different dependency sets depending on your use case: -#### Option 1: Core Functionality Only (Recommended) +#### Option 1: Core Dependencies Only -For basic TokenSmith functionality including inspect, sample, export, and edit operations: +For basic installation (most functionality still requires GPT-NeoX environment): ```bash git clone https://github.com/aflah02/tokensmith.git @@ -55,11 +55,11 @@ This installs core dependencies: - `pandas` - Data processing - `tqdm` - Progress bars -**Note:** `transformers` and `torch` are provided by the GPT-NeoX environment when needed for advanced functionality. +**Note:** This installation alone only allows imports. Dataset operations, UI, and most functionality require GPT-NeoX environment. -#### Option 2: With Search Functionality +#### Option 2: With Search Functionality (Works Standalone) -For search and indexing operations using tokengrams: +For search and indexing operations using tokengrams - **this is the only option that works without GPT-NeoX**: ```bash git clone https://github.com/aflah02/tokensmith.git @@ -72,7 +72,7 @@ Or with Poetry: poetry install --with search ``` -#### Option 3: With UI Support +#### Option 3: With UI Support (Requires GPT-NeoX) For the interactive Streamlit web interface: @@ -100,7 +100,7 @@ Or with Poetry: poetry install --with docs ``` -#### Option 5: Complete Installation (All Features) +#### Option 5: Complete Installation (Requires GPT-NeoX) For all optional dependencies (search, UI, and docs): @@ -123,8 +123,8 @@ pip install -e ".[search,ui]" # Search + UI **Note:** For functionality that requires GPT-NeoX/Megatron (such as `WriteableMMapIndexedDataset` and some advanced dataset operations), you must separately install GPT-NeoX following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies). TokenSmith is designed to work with or without GPT-NeoX: -- **Without GPT-NeoX**: Core functionality, search, UI, and most operations work perfectly -- **With GPT-NeoX**: Full dataset editing and advanced Megatron-compatible operations are available +- **Without GPT-NeoX**: Only search functionality works standalone +- **With GPT-NeoX**: Full functionality including UI, dataset operations, editing, sampling, and advanced operations **Note:** `torch` and `transformers` are provided by the GPT-NeoX environment and are not included as TokenSmith dependencies to avoid version conflicts. @@ -135,32 +135,26 @@ TokenSmith is designed to work with or without GPT-NeoX: #### Which Installation Option to Choose? -- **Core functionality**: Use Option 1 if you want basic dataset operations without search or UI -- **Search features**: Use Option 2 if you need token sequence search and indexing -- **Web interface**: Use Option 3 if you want the interactive Streamlit UI for visual dataset exploration -- **Documentation**: Use Option 4 if you're contributing to documentation or want to build docs locally -- **Complete features**: Use Option 5 if you want all functionality available -- **Development**: Use Option 5 if you're developing TokenSmith or want all features +- **Search only**: Use Option 2 if you only need token sequence search and indexing (works standalone) +- **Full functionality**: Use Options 3-5 if you need UI or dataset operations (requires GPT-NeoX environment) + - **Web interface**: Use Option 3 for interactive Streamlit UI + - **Documentation**: Use Option 4 for contributing to docs + - **Complete features**: Use Option 5 for all functionality +- **Development**: Use Option 5 for developing TokenSmith + +**Important**: Only search functionality works without GPT-NeoX. All other features require the GPT-NeoX environment. ### Basic Usage -#### Core Functionality (Works with basic installation) +#### Search Functionality (Works standalone - no GPT-NeoX required) ```python from tokensmith import DatasetManager -from transformers import AutoTokenizer # Initialize the manager manager = DatasetManager() -# Load a tokenizer for detokenization -tokenizer = AutoTokenizer.from_pretrained("gpt2") -``` - -#### Search Functionality (Requires search installation) - -```python -# Setup search functionality - requires tokengrams +# Setup search functionality - requires tokengrams but no GPT-NeoX try: manager.setup_search( bin_file_path="path/to/dataset.bin", @@ -179,12 +173,17 @@ except ImportError as e: print("ℹ️ Search functionality requires tokengrams: pip install 'tokensmith[search]'") ``` -#### Advanced Operations (Requires GPT-NeoX environment) +#### Dataset Operations (Requires GPT-NeoX environment) ```python -# Setup dataset for inspection, sampling, editing, and export -# Note: This requires GPT-NeoX/Megatron to be installed +from tokensmith import DatasetManager + +# Initialize the manager +manager = DatasetManager() + +# Note: All operations below require GPT-NeoX environment to be installed try: + # Setup dataset for inspection, sampling, editing, and export manager.setup_edit_inspect_sample_export( dataset_prefix="path/to/your/dataset", batch_info_save_prefix="path/to/batch_info", @@ -193,14 +192,21 @@ try: train_seq_len=1024, seed=42 ) + + # Load a tokenizer (requires transformers from GPT-NeoX environment) + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained("gpt2") + print("✅ Full functionality available") except ImportError as e: - print("ℹ️ Advanced operations require GPT-NeoX installation") + print("ℹ️ Dataset operations require GPT-NeoX installation") + print(" See: https://github.com/EleutherAI/gpt-neox") ``` -#### Web UI Usage (Requires UI installation) +#### Web UI Usage (Requires GPT-NeoX environment) ```bash +# Note: UI requires GPT-NeoX environment # Navigate to UI directory and run cd tokensmith/ui ./run_ui.sh From 830bd9e00fa74cb2cc81f3a14bdcf50ad034e0c0 Mon Sep 17 00:00:00 2001 From: Aflah <72096386+aflah02@users.noreply.github.com> Date: Fri, 26 Sep 2025 13:06:05 +0200 Subject: [PATCH 7/9] Revise README for TokenSmith installation options Updated installation instructions and clarified dependencies for TokenSmith. --- README.md | 170 ++++++++++-------------------------------------------- 1 file changed, 29 insertions(+), 141 deletions(-) diff --git a/README.md b/README.md index 1975df3..8e17ccc 100644 --- a/README.md +++ b/README.md @@ -38,182 +38,70 @@ DatasetManager ### Installation -TokenSmith can be installed with different dependency sets depending on your use case: +`tokensmith` can be installed in several ways depending on your use case. -#### Option 1: Core Dependencies Only +Note: Apart from search all features assume that GPT-NeoX is installed to use Megatron. You can do that by simply following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies). -For basic installation (most functionality still requires GPT-NeoX environment): +## 1. Basic Installation (Core Only) -```bash -git clone https://github.com/aflah02/tokensmith.git -cd tokensmith -pip install -e . -``` - -This installs core dependencies: -- `numpy` - Array operations -- `pandas` - Data processing -- `tqdm` - Progress bars - -**Note:** This installation alone only allows imports. Dataset operations, UI, and most functionality require GPT-NeoX environment. - -#### Option 2: With Search Functionality (Works Standalone) - -For search and indexing operations using tokengrams - **this is the only option that works without GPT-NeoX**: +If you only need the **core functionality** (data editing, sampling, importing, exporting, inspection): ```bash -git clone https://github.com/aflah02/tokensmith.git -cd tokensmith -pip install -e ".[search]" +pip install tokensmith ``` -Or with Poetry: -```bash -poetry install --with search -``` +## 2. With Documentation Dependencies -#### Option 3: With UI Support (Requires GPT-NeoX) +If you plan to build or serve the documentation locally: -For the interactive Streamlit web interface: - -```bash -git clone https://github.com/aflah02/tokensmith.git -cd tokensmith -pip install -e ".[ui]" -``` - -Or with Poetry: ```bash -poetry install --with ui +pip install "tokensmith[docs]" ``` -#### Option 4: With Documentation Tools - -For building documentation: +Once installed, you can build and serve the docs: ```bash -pip install -e ".[docs]" -``` - -Or with Poetry: -```bash -poetry install --with docs +mkdocs serve ``` -#### Option 5: Complete Installation (Requires GPT-NeoX) +## 3. With UI Components -For all optional dependencies (search, UI, and docs): +If you want the **interactive interface** for exploring data: ```bash -pip install -e ".[all]" +pip install "tokensmith[ui]" ``` -Or with Poetry: -```bash -poetry install --with all -``` +## 4. With Search Features + +For advanced **token-level search and n-gram utilities**: -You can also combine multiple options: ```bash -pip install -e ".[search,ui]" # Search + UI +pip install "tokensmith[search]" ``` -#### GPT-NeoX/Megatron Integration - -**Note:** For functionality that requires GPT-NeoX/Megatron (such as `WriteableMMapIndexedDataset` and some advanced dataset operations), you must separately install GPT-NeoX following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies). - -TokenSmith is designed to work with or without GPT-NeoX: -- **Without GPT-NeoX**: Only search functionality works standalone -- **With GPT-NeoX**: Full functionality including UI, dataset operations, editing, sampling, and advanced operations - -**Note:** `torch` and `transformers` are provided by the GPT-NeoX environment and are not included as TokenSmith dependencies to avoid version conflicts. - -#### Python Version Requirements - -- **Python 3.8+** is required -- Compatible with modern Python versions and dependency ecosystems - -#### Which Installation Option to Choose? - -- **Search only**: Use Option 2 if you only need token sequence search and indexing (works standalone) -- **Full functionality**: Use Options 3-5 if you need UI or dataset operations (requires GPT-NeoX environment) - - **Web interface**: Use Option 3 for interactive Streamlit UI - - **Documentation**: Use Option 4 for contributing to docs - - **Complete features**: Use Option 5 for all functionality -- **Development**: Use Option 5 for developing TokenSmith - -**Important**: Only search functionality works without GPT-NeoX. All other features require the GPT-NeoX environment. +## 5. Full Installation (Everything) -### Basic Usage +To install **all optional features**: -#### Search Functionality (Works standalone - no GPT-NeoX required) - -```python -from tokensmith import DatasetManager - -# Initialize the manager -manager = DatasetManager() - -# Setup search functionality - requires tokengrams but no GPT-NeoX -try: - manager.setup_search( - bin_file_path="path/to/dataset.bin", - search_index_save_path="path/to/search_index", - vocab=2**16, # or 2**32 for larger vocabularies - reuse=True - ) - - # Search operations - query = [101, 2023, 102] # Token IDs - count = manager.search.count(query) - positions = manager.search.positions(query) - print("✅ Search functionality available") - -except ImportError as e: - print("ℹ️ Search functionality requires tokengrams: pip install 'tokensmith[search]'") +```bash +pip install "tokensmith[all]" ``` -#### Dataset Operations (Requires GPT-NeoX environment) +This includes docs, UI, and search extras. -```python -from tokensmith import DatasetManager - -# Initialize the manager -manager = DatasetManager() - -# Note: All operations below require GPT-NeoX environment to be installed -try: - # Setup dataset for inspection, sampling, editing, and export - manager.setup_edit_inspect_sample_export( - dataset_prefix="path/to/your/dataset", - batch_info_save_prefix="path/to/batch_info", - train_iters=1000, - train_batch_size=32, - train_seq_len=1024, - seed=42 - ) - - # Load a tokenizer (requires transformers from GPT-NeoX environment) - from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained("gpt2") - - print("✅ Full functionality available") -except ImportError as e: - print("ℹ️ Dataset operations require GPT-NeoX installation") - print(" See: https://github.com/EleutherAI/gpt-neox") -``` +## 6. Development Installation -#### Web UI Usage (Requires GPT-NeoX environment) +If you’re contributing to `tokensmith`: ```bash -# Note: UI requires GPT-NeoX environment -# Navigate to UI directory and run -cd tokensmith/ui -./run_ui.sh - -# Or modify run_ui.sh for your specific setup +git clone https://github.com/aflah02/tokensmith.git +cd tokensmith +pip install -e ".[all,docs,ui,search]" ``` +This sets up a local environment with all extras for development. + ## 📚 Core Functionality ### 🔍 Search Operations From 8a9f5caa7579f6bfecdcf5a2a69429300af96f0a Mon Sep 17 00:00:00 2001 From: Aflah <72096386+aflah02@users.noreply.github.com> Date: Fri, 26 Sep 2025 13:10:37 +0200 Subject: [PATCH 8/9] Update author details and repository URL in setup.py --- setup.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 4ddeb82..493504d 100644 --- a/setup.py +++ b/setup.py @@ -3,16 +3,14 @@ setup( name='tokensmith', version='0.1.0', - author='Your Name', - author_email='your.email@example.com', description='A package for managing datasets with editing, inspecting, sampling, exporting, and searching functionalities.', long_description=open('README.md').read(), long_description_content_type='text/markdown', - url='https://github.com/yourusername/tokensmith', # Replace with your actual repository URL + url='https://github.com/aflah02/TokenSmith', packages=find_packages(), classifiers=[ 'Programming Language :: Python :: 3', - 'License :: OSI Approved :: MIT License', # Replace with your actual license + 'License :: Apache 2.0 License', 'Operating System :: OS Independent', ], python_requires='>=3.8', @@ -41,4 +39,4 @@ 'tokengrams>=0.3.0', ], }, -) \ No newline at end of file +) From 532ff806abe2b288e782ac78edecb99bb9959400 Mon Sep 17 00:00:00 2001 From: Aflah <72096386+aflah02@users.noreply.github.com> Date: Fri, 26 Sep 2025 13:11:28 +0200 Subject: [PATCH 9/9] Correct capitalization of 'TokenSmith' in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8e17ccc..b0c2490 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ DatasetManager ### Installation -`tokensmith` can be installed in several ways depending on your use case. +`TokenSmith` can be installed in several ways depending on your use case. Note: Apart from search all features assume that GPT-NeoX is installed to use Megatron. You can do that by simply following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies).