From 0f7e082f9a3b14bbabe6a93c6cdb2587b0ba8d16 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 26 Sep 2025 09:44:59 +0000
Subject: [PATCH 1/9] Initial plan


From 52e8b5bdb4923886875adacc5753f2362003c911 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 26 Sep 2025 09:55:08 +0000
Subject: [PATCH 2/9] Add missing dependencies to pyproject.toml and fix
 megatron import issues

Co-authored-by: aflah02 <72096386+aflah02@users.noreply.github.com>
---
 pyproject.toml      | 14 ++++++++++++--
 tokensmith/utils.py | 15 +++++++++++++--
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7793da6..ec365df 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -10,8 +10,14 @@ repository = "https://github.com/aflah02/tokensmith"
 keywords = ["dataset", "management", "editing", "sampling", "exporting", "searching"]
 
 [tool.poetry.dependencies]
-python = "^3.7"
-# Add other dependencies here
+python = "^3.8"
+# Core dependencies
+numpy = "^1.21.0"
+pandas = "^1.3.0"
+tqdm = "^4.62.0"
+transformers = "^4.20.0"
+torch = "^1.11.0"
+tokengrams = "^0.3.0"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.5.0"
@@ -19,6 +25,10 @@ mkdocs-material = "^9.0.0"
 mkdocstrings = {extras = ["python"], version = "^0.24.0"}
 mkdocstrings-python = "^1.7.0"
 
+[tool.poetry.group.ui.dependencies]
+streamlit = "^1.20.0"
+altair = "^4.2.0"
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
\ No newline at end of file
diff --git a/tokensmith/utils.py b/tokensmith/utils.py
index 6ac807c..bda3126 100644
--- a/tokensmith/utils.py
+++ b/tokensmith/utils.py
@@ -6,14 +6,22 @@
 from tqdm import trange
 import uuid
 from typing import Optional, List, Dict, Any
-from megatron.data.indexed_dataset import MMapIndexedDataset
-from .megatron_dependencies import get_train_valid_test_split_, build_index_mappings
 from transformers import AutoTokenizer
 import torch
 import os
 from functools import lru_cache
 import time
 
+# Optional megatron imports - will be imported when needed
+try:
+    from megatron.data.indexed_dataset import MMapIndexedDataset
+    MEGATRON_AVAILABLE = True
+except ImportError:
+    MMapIndexedDataset = None
+    MEGATRON_AVAILABLE = False
+
+from .megatron_dependencies import get_train_valid_test_split_, build_index_mappings
+
 logger = logging.getLogger(__name__)
 
 @lru_cache(1)
@@ -61,6 +69,9 @@ def __init__(self,
                  packing_impl: str,
                  allow_chopped: bool,
                  add_extra_token_to_seq: int):
+        if not MEGATRON_AVAILABLE:
+            raise ImportError("Megatron is required for WriteableMMapIndexedDataset functionality. Please install GPT-NeoX following the instructions in the README.")
+            
         logger.debug(f"Initializing WriteableMMapIndexedDataset with pointer: {dataset_prefix}.bin and index: {dataset_prefix}.idx")
 
         self.corpus_pointer = open(f"{dataset_prefix}.bin", 'r+b')

From 6d709e57d9f4905720f780018de8d3d520a29d42 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 26 Sep 2025 09:57:44 +0000
Subject: [PATCH 3/9] Update setup.py dependencies and add comprehensive test
 validation

Co-authored-by: aflah02 <72096386+aflah02@users.noreply.github.com>
---
 setup.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 59376e4..636c351 100644
--- a/setup.py
+++ b/setup.py
@@ -15,8 +15,25 @@
         'License :: OSI Approved :: MIT License',  # Replace with your actual license
         'Operating System :: OS Independent',
     ],
-    python_requires='>=3.6',
+    python_requires='>=3.8',
     install_requires=[
-        # List your package dependencies here
+        'numpy>=1.21.0',
+        'pandas>=1.3.0', 
+        'tqdm>=4.62.0',
+        'transformers>=4.20.0',
+        'torch>=1.11.0',
+        'tokengrams>=0.3.0',
     ],
+    extras_require={
+        'ui': [
+            'streamlit>=1.20.0',
+            'altair>=4.2.0',
+        ],
+        'docs': [
+            'mkdocs>=1.5.0',
+            'mkdocs-material>=9.0.0', 
+            'mkdocstrings[python]>=0.24.0',
+            'mkdocstrings-python>=1.7.0',
+        ],
+    },
 )
\ No newline at end of file

From 06792cb810ef35f45ce4059f4d4a5c8e41827a0e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 26 Sep 2025 10:13:57 +0000
Subject: [PATCH 4/9] Remove torch and make tokengrams optional with separate
 install groups

Co-authored-by: aflah02 <72096386+aflah02@users.noreply.github.com>
---
 README.md                    | 166 +++++++++++++++++++++++++++++++----
 pyproject.toml               |  11 ++-
 setup.py                     |  10 ++-
 tokensmith/search/handler.py |  11 ++-
 tokensmith/utils.py          |   1 -
 5 files changed, 174 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index d946694..03ea428 100644
--- a/README.md
+++ b/README.md
@@ -38,9 +38,11 @@ DatasetManager
 
 ### Installation
 
-Ensure you have a working GPT-NeoX environment using steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies)
+TokenSmith can be installed with different dependency sets depending on your use case:
 
-Within the same env run the following - 
+#### Option 1: Core Functionality Only (Recommended)
+
+For basic TokenSmith functionality including inspect, sample, export, and edit operations:
 
 ```bash
 git clone https://github.com/aflah02/tokensmith.git
@@ -48,8 +50,101 @@ cd tokensmith
 pip install -e .
 ```
 
+This installs core dependencies:
+- `numpy` - Array operations
+- `pandas` - Data processing  
+- `tqdm` - Progress bars
+- `transformers` - Tokenizer support
+
+#### Option 2: With Search Functionality
+
+For search and indexing operations using tokengrams:
+
+```bash
+git clone https://github.com/aflah02/tokensmith.git
+cd tokensmith
+pip install -e ".[search]"
+```
+
+Or with Poetry:
+```bash
+poetry install --with search
+```
+
+#### Option 3: With UI Support
+
+For the interactive Streamlit web interface:
+
+```bash
+git clone https://github.com/aflah02/tokensmith.git
+cd tokensmith
+pip install -e ".[ui]"
+```
+
+Or with Poetry:
+```bash
+poetry install --with ui
+```
+
+#### Option 4: With Documentation Tools
+
+For building documentation:
+
+```bash
+pip install -e ".[docs]"
+```
+
+Or with Poetry:
+```bash
+poetry install --with docs
+```
+
+#### Option 5: Complete Installation (All Features)
+
+For all optional dependencies (search, UI, and docs):
+
+```bash
+pip install -e ".[all]"
+```
+
+Or with Poetry:
+```bash
+poetry install --with all
+```
+
+You can also combine multiple options:
+```bash
+pip install -e ".[search,ui]"  # Search + UI
+```
+
+#### GPT-NeoX/Megatron Integration
+
+**Note:** For functionality that requires GPT-NeoX/Megatron (such as `WriteableMMapIndexedDataset` and some advanced dataset operations), you must separately install GPT-NeoX following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies).
+
+TokenSmith is designed to work with or without GPT-NeoX:
+- **Without GPT-NeoX**: Core functionality, search, UI, and most operations work perfectly
+- **With GPT-NeoX**: Full dataset editing and advanced Megatron-compatible operations are available
+
+**Note:** Torch is provided by the GPT-NeoX environment and is not included as a TokenSmith dependency to avoid version conflicts.
+
+#### Python Version Requirements
+
+- **Python 3.8+** is required
+- Compatible with modern Python versions and dependency ecosystems
+
+#### Which Installation Option to Choose?
+
+- **Core functionality**: Use Option 1 if you want basic dataset operations without search or UI
+- **Search features**: Use Option 2 if you need token sequence search and indexing  
+- **Web interface**: Use Option 3 if you want the interactive Streamlit UI for visual dataset exploration
+- **Documentation**: Use Option 4 if you're contributing to documentation or want to build docs locally  
+- **Complete features**: Use Option 5 if you want all functionality available
+- **Development**: Use Option 5 if you're developing TokenSmith or want all features
+
 ### Basic Usage
 
+#### Core Functionality (Works with basic installation)
+
 ```python
 from tokensmith import DatasetManager
 from transformers import AutoTokenizer
@@ -57,26 +152,59 @@ from transformers import AutoTokenizer
 # Initialize the manager
 manager = DatasetManager()
 
+# Load a tokenizer for detokenization
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+```
+
+#### Search Functionality (Requires search installation)
+
+```python
+# Setup search functionality - requires tokengrams
+try:
+    manager.setup_search(
+        bin_file_path="path/to/dataset.bin",
+        search_index_save_path="path/to/search_index",
+        vocab=2**16,  # or 2**32 for larger vocabularies
+        reuse=True
+    )
+    
+    # Search operations
+    query = [101, 2023, 102]  # Token IDs
+    count = manager.search.count(query)
+    positions = manager.search.positions(query)
+    print("✅ Search functionality available")
+    
+except ImportError as e:
+    print("ℹ️ Search functionality requires tokengrams: pip install 'tokensmith[search]'")
+```
+
+#### Advanced Operations (Requires GPT-NeoX environment)
+
+```python
 # Setup dataset for inspection, sampling, editing, and export
-manager.setup_edit_inspect_sample_export(
-    dataset_prefix="path/to/your/dataset",
-    batch_info_save_prefix="path/to/batch_info",
-    train_iters=1000,
-    train_batch_size=32,
-    train_seq_len=1024,
-    seed=42
-)
+# Note: This requires GPT-NeoX/Megatron to be installed
+try:
+    manager.setup_edit_inspect_sample_export(
+        dataset_prefix="path/to/your/dataset",
+        batch_info_save_prefix="path/to/batch_info",
+        train_iters=1000,
+        train_batch_size=32,
+        train_seq_len=1024,
+        seed=42
+    )
+    print("✅ Full functionality available")
+except ImportError as e:
+    print("ℹ️ Advanced operations require GPT-NeoX installation")
+```
 
-# Setup search functionality (optional)
-manager.setup_search(
-    bin_file_path="path/to/dataset.bin",
-    search_index_save_path="path/to/search_index",
-    vocab=2**16,  # or 2**32 for larger vocabularies
-    reuse=True
-)
+#### Web UI Usage (Requires UI installation)
 
-# Load a tokenizer for detokenization
-tokenizer = AutoTokenizer.from_pretrained("gpt2")
+```bash
+# Navigate to UI directory and run
+cd tokensmith/ui
+./run_ui.sh
+
+# Or modify run_ui.sh for your specific setup
 ```
 
 ## 📚 Core Functionality
diff --git a/pyproject.toml b/pyproject.toml
index ec365df..fa27f9a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -16,8 +16,6 @@ numpy = "^1.21.0"
 pandas = "^1.3.0"
 tqdm = "^4.62.0"
 transformers = "^4.20.0"
-torch = "^1.11.0"
-tokengrams = "^0.3.0"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.5.0"
@@ -29,6 +27,15 @@ mkdocstrings-python = "^1.7.0"
 streamlit = "^1.20.0"
 altair = "^4.2.0"
 
+[tool.poetry.group.search.dependencies]
+tokengrams = "^0.3.0"
+
+[tool.poetry.group.all.dependencies]
+# All optional dependencies for complete functionality
+streamlit = "^1.20.0"
+altair = "^4.2.0"
+tokengrams = "^0.3.0"
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 636c351..db19bd3 100644
--- a/setup.py
+++ b/setup.py
@@ -21,19 +21,25 @@
         'pandas>=1.3.0', 
         'tqdm>=4.62.0',
         'transformers>=4.20.0',
-        'torch>=1.11.0',
-        'tokengrams>=0.3.0',
     ],
     extras_require={
         'ui': [
             'streamlit>=1.20.0',
             'altair>=4.2.0',
         ],
+        'search': [
+            'tokengrams>=0.3.0',
+        ],
         'docs': [
             'mkdocs>=1.5.0',
             'mkdocs-material>=9.0.0', 
             'mkdocstrings[python]>=0.24.0',
             'mkdocstrings-python>=1.7.0',
         ],
+        'all': [
+            'streamlit>=1.20.0',
+            'altair>=4.2.0',
+            'tokengrams>=0.3.0',
+        ],
     },
 )
\ No newline at end of file
diff --git a/tokensmith/search/handler.py b/tokensmith/search/handler.py
index bc14e20..c637dc8 100644
--- a/tokensmith/search/handler.py
+++ b/tokensmith/search/handler.py
@@ -1,12 +1,21 @@
 # Heavily inspired by the original code from https://github.com/EleutherAI/tokengrams/blob/master/tokengrams/tokengrams.pyi and uses the same library.
 
-from tokengrams import MemmapIndex
 from typing import List
 import os
 import logging
 
+# Optional tokengrams import - will be imported when needed
+try:
+    from tokengrams import MemmapIndex
+    TOKENGRAMS_AVAILABLE = True
+except ImportError:
+    MemmapIndex = None
+    TOKENGRAMS_AVAILABLE = False
+
 class SearchHandler:
     def __init__(self, bin_file_path: str, index_save_path: str, vocab: int, verbose: bool = True, reuse: bool = True):
+        if not TOKENGRAMS_AVAILABLE:
+            raise ImportError("Tokengrams is required for search functionality. Please install with: pip install 'tokensmith[search]' or pip install tokengrams")
 
         self.bin_file_path = bin_file_path
         self.index_save_path = index_save_path
diff --git a/tokensmith/utils.py b/tokensmith/utils.py
index bda3126..37cc823 100644
--- a/tokensmith/utils.py
+++ b/tokensmith/utils.py
@@ -7,7 +7,6 @@
 import uuid
 from typing import Optional, List, Dict, Any
 from transformers import AutoTokenizer
-import torch
 import os
 from functools import lru_cache
 import time

From a552ce677a93f233069876557bd4cbe4826895b9 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 26 Sep 2025 10:20:45 +0000
Subject: [PATCH 5/9] Remove transformers dependency to avoid NeoX version
 conflicts

Co-authored-by: aflah02 <72096386+aflah02@users.noreply.github.com>
---
 README.md            |  5 +++--
 pyproject.toml       |  1 -
 setup.py             |  1 -
 tokensmith/ui/app.py |  2 ++
 tokensmith/utils.py  | 26 ++++++++++++++++++++++++--
 5 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 03ea428..3b8b2f6 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,8 @@ This installs core dependencies:
 - `numpy` - Array operations
 - `pandas` - Data processing  
 - `tqdm` - Progress bars
-- `transformers` - Tokenizer support
+
+**Note:** `transformers` and `torch` are provided by the GPT-NeoX environment when needed for advanced functionality.
 
 #### Option 2: With Search Functionality
 
@@ -125,7 +126,7 @@ TokenSmith is designed to work with or without GPT-NeoX:
 - **Without GPT-NeoX**: Core functionality, search, UI, and most operations work perfectly
 - **With GPT-NeoX**: Full dataset editing and advanced Megatron-compatible operations are available
 
-**Note:** Torch is provided by the GPT-NeoX environment and is not included as a TokenSmith dependency to avoid version conflicts.
+**Note:** `torch` and `transformers` are provided by the GPT-NeoX environment and are not included as TokenSmith dependencies to avoid version conflicts.
 
 #### Python Version Requirements
 
diff --git a/pyproject.toml b/pyproject.toml
index fa27f9a..07202b2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,6 @@ python = "^3.8"
 numpy = "^1.21.0"
 pandas = "^1.3.0"
 tqdm = "^4.62.0"
-transformers = "^4.20.0"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.5.0"
diff --git a/setup.py b/setup.py
index db19bd3..4ddeb82 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,6 @@
         'numpy>=1.21.0',
         'pandas>=1.3.0', 
         'tqdm>=4.62.0',
-        'transformers>=4.20.0',
     ],
     extras_require={
         'ui': [
diff --git a/tokensmith/ui/app.py b/tokensmith/ui/app.py
index 71e0193..3721b89 100644
--- a/tokensmith/ui/app.py
+++ b/tokensmith/ui/app.py
@@ -62,6 +62,8 @@ class DefaultArgs:
                 from transformers import AutoTokenizer
                 print(f"Loading tokenizer from {st.session_state.args.tokenizer_path}")
                 st.session_state.tokenizer = AutoTokenizer.from_pretrained(st.session_state.args.tokenizer_path)
+            except ImportError:
+                st.error("Transformers library not available. Tokenizer functionality requires transformers to be installed (should be available in GPT-NeoX environment).")
             except Exception as e:
                 st.error(f"Failed to load tokenizer: {e}")
     
diff --git a/tokensmith/utils.py b/tokensmith/utils.py
index 37cc823..59e7729 100644
--- a/tokensmith/utils.py
+++ b/tokensmith/utils.py
@@ -6,11 +6,18 @@
 from tqdm import trange
 import uuid
 from typing import Optional, List, Dict, Any
-from transformers import AutoTokenizer
 import os
 from functools import lru_cache
 import time
 
+# Optional transformers imports - will be imported when needed
+try:
+    from transformers import AutoTokenizer
+    TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    AutoTokenizer = None
+    TRANSFORMERS_AVAILABLE = False
+
 # Optional megatron imports - will be imported when needed
 try:
     from megatron.data.indexed_dataset import MMapIndexedDataset
@@ -28,7 +35,22 @@ def warn_once(logger: logging.Logger, msg: str):
     logger.warning(msg)
     time.sleep(10)
 
-def generate_training_sample(tokenized_segments: List[List[int]], tokenizer: AutoTokenizer) -> str:
+def generate_training_sample(tokenized_segments: List[List[int]], tokenizer) -> str:
+    """Generate training sample from tokenized segments using a tokenizer.
+    
+    Args:
+        tokenized_segments: List of tokenized segments
+        tokenizer: Tokenizer object (should have a decode method)
+        
+    Returns:
+        Decoded text string
+        
+    Raises:
+        ImportError: If transformers is not available and tokenizer is None
+    """
+    if not TRANSFORMERS_AVAILABLE and tokenizer is None:
+        raise ImportError("Transformers is required for tokenization functionality. It should be available from your GPT-NeoX environment.")
+    
     concat_training_sample = np.concatenate(tokenized_segments)
     return tokenizer.decode(
         concat_training_sample,

From 5b14cb98b8c1c5c8f78f492646904254d643b471 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 26 Sep 2025 10:25:56 +0000
Subject: [PATCH 6/9] Correct documentation - only search works without
 GPT-NeoX environment

Co-authored-by: aflah02 <72096386+aflah02@users.noreply.github.com>
---
 README.md | 66 ++++++++++++++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 3b8b2f6..1975df3 100644
--- a/README.md
+++ b/README.md
@@ -40,9 +40,9 @@ DatasetManager
 
 TokenSmith can be installed with different dependency sets depending on your use case:
 
-#### Option 1: Core Functionality Only (Recommended)
+#### Option 1: Core Dependencies Only
 
-For basic TokenSmith functionality including inspect, sample, export, and edit operations:
+For basic installation (most functionality still requires GPT-NeoX environment):
 
 ```bash
 git clone https://github.com/aflah02/tokensmith.git
@@ -55,11 +55,11 @@ This installs core dependencies:
 - `pandas` - Data processing  
 - `tqdm` - Progress bars
 
-**Note:** `transformers` and `torch` are provided by the GPT-NeoX environment when needed for advanced functionality.
+**Note:** This installation alone only allows imports. Dataset operations, UI, and most functionality require GPT-NeoX environment.
 
-#### Option 2: With Search Functionality
+#### Option 2: With Search Functionality (Works Standalone)
 
-For search and indexing operations using tokengrams:
+For search and indexing operations using tokengrams - **this is the only option that works without GPT-NeoX**:
 
 ```bash
 git clone https://github.com/aflah02/tokensmith.git
@@ -72,7 +72,7 @@ Or with Poetry:
 poetry install --with search
 ```
 
-#### Option 3: With UI Support
+#### Option 3: With UI Support (Requires GPT-NeoX)
 
 For the interactive Streamlit web interface:
 
@@ -100,7 +100,7 @@ Or with Poetry:
 poetry install --with docs
 ```
 
-#### Option 5: Complete Installation (All Features)
+#### Option 5: Complete Installation (Requires GPT-NeoX)
 
 For all optional dependencies (search, UI, and docs):
 
@@ -123,8 +123,8 @@ pip install -e ".[search,ui]"  # Search + UI
 **Note:** For functionality that requires GPT-NeoX/Megatron (such as `WriteableMMapIndexedDataset` and some advanced dataset operations), you must separately install GPT-NeoX following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies).
 
 TokenSmith is designed to work with or without GPT-NeoX:
-- **Without GPT-NeoX**: Core functionality, search, UI, and most operations work perfectly
-- **With GPT-NeoX**: Full dataset editing and advanced Megatron-compatible operations are available
+- **Without GPT-NeoX**: Only search functionality works standalone
+- **With GPT-NeoX**: Full functionality including UI, dataset operations, editing, sampling, and advanced operations
 
 **Note:** `torch` and `transformers` are provided by the GPT-NeoX environment and are not included as TokenSmith dependencies to avoid version conflicts.
 
@@ -135,32 +135,26 @@ TokenSmith is designed to work with or without GPT-NeoX:
 
 #### Which Installation Option to Choose?
 
-- **Core functionality**: Use Option 1 if you want basic dataset operations without search or UI
-- **Search features**: Use Option 2 if you need token sequence search and indexing  
-- **Web interface**: Use Option 3 if you want the interactive Streamlit UI for visual dataset exploration
-- **Documentation**: Use Option 4 if you're contributing to documentation or want to build docs locally  
-- **Complete features**: Use Option 5 if you want all functionality available
-- **Development**: Use Option 5 if you're developing TokenSmith or want all features
+- **Search only**: Use Option 2 if you only need token sequence search and indexing (works standalone)
+- **Full functionality**: Use Options 3-5 if you need UI or dataset operations (requires GPT-NeoX environment)
+  - **Web interface**: Use Option 3 for interactive Streamlit UI
+  - **Documentation**: Use Option 4 for contributing to docs
+  - **Complete features**: Use Option 5 for all functionality
+- **Development**: Use Option 5 for developing TokenSmith
+
+**Important**: Only search functionality works without GPT-NeoX. All other features require the GPT-NeoX environment.
 
 ### Basic Usage
 
-#### Core Functionality (Works with basic installation)
+#### Search Functionality (Works standalone - no GPT-NeoX required)
 
 ```python
 from tokensmith import DatasetManager
-from transformers import AutoTokenizer
 
 # Initialize the manager
 manager = DatasetManager()
 
-# Load a tokenizer for detokenization
-tokenizer = AutoTokenizer.from_pretrained("gpt2")
-```
-
-#### Search Functionality (Requires search installation)
-
-```python
-# Setup search functionality - requires tokengrams
+# Setup search functionality - requires tokengrams but no GPT-NeoX
 try:
     manager.setup_search(
         bin_file_path="path/to/dataset.bin",
@@ -179,12 +173,17 @@ except ImportError as e:
     print("ℹ️ Search functionality requires tokengrams: pip install 'tokensmith[search]'")
 ```
 
-#### Advanced Operations (Requires GPT-NeoX environment)
+#### Dataset Operations (Requires GPT-NeoX environment)
 
 ```python
-# Setup dataset for inspection, sampling, editing, and export
-# Note: This requires GPT-NeoX/Megatron to be installed
+from tokensmith import DatasetManager
+
+# Initialize the manager
+manager = DatasetManager()
+
+# Note: All operations below require GPT-NeoX environment to be installed
 try:
+    # Setup dataset for inspection, sampling, editing, and export
     manager.setup_edit_inspect_sample_export(
         dataset_prefix="path/to/your/dataset",
         batch_info_save_prefix="path/to/batch_info",
@@ -193,14 +192,21 @@ try:
         train_seq_len=1024,
         seed=42
     )
+    
+    # Load a tokenizer (requires transformers from GPT-NeoX environment)
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    
     print("✅ Full functionality available")
 except ImportError as e:
-    print("ℹ️ Advanced operations require GPT-NeoX installation")
+    print("ℹ️ Dataset operations require GPT-NeoX installation")
+    print("   See: https://github.com/EleutherAI/gpt-neox")
 ```
 
-#### Web UI Usage (Requires UI installation)
+#### Web UI Usage (Requires GPT-NeoX environment)
 
 ```bash
+# Note: UI requires GPT-NeoX environment
 # Navigate to UI directory and run
 cd tokensmith/ui
 ./run_ui.sh

From 830bd9e00fa74cb2cc81f3a14bdcf50ad034e0c0 Mon Sep 17 00:00:00 2001
From: Aflah <72096386+aflah02@users.noreply.github.com>
Date: Fri, 26 Sep 2025 13:06:05 +0200
Subject: [PATCH 7/9] Revise README for TokenSmith installation options

Updated installation instructions and clarified dependencies for TokenSmith.
---
 README.md | 170 ++++++++++--------------------------------------------
 1 file changed, 29 insertions(+), 141 deletions(-)

diff --git a/README.md b/README.md
index 1975df3..8e17ccc 100644
--- a/README.md
+++ b/README.md
@@ -38,182 +38,70 @@ DatasetManager
 
 ### Installation
 
-TokenSmith can be installed with different dependency sets depending on your use case:
+`tokensmith` can be installed in several ways depending on your use case.
 
-#### Option 1: Core Dependencies Only
+Note: Apart from search all features assume that GPT-NeoX is installed to use Megatron. You can do that by simply following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies).
 
-For basic installation (most functionality still requires GPT-NeoX environment):
+## 1. Basic Installation (Core Only)
 
-```bash
-git clone https://github.com/aflah02/tokensmith.git
-cd tokensmith
-pip install -e .
-```
-
-This installs core dependencies:
-- `numpy` - Array operations
-- `pandas` - Data processing  
-- `tqdm` - Progress bars
-
-**Note:** This installation alone only allows imports. Dataset operations, UI, and most functionality require GPT-NeoX environment.
-
-#### Option 2: With Search Functionality (Works Standalone)
-
-For search and indexing operations using tokengrams - **this is the only option that works without GPT-NeoX**:
+If you only need the **core functionality** (data editing, sampling, importing, exporting, inspection):
 
 ```bash
-git clone https://github.com/aflah02/tokensmith.git
-cd tokensmith
-pip install -e ".[search]"
+pip install tokensmith
 ```
 
-Or with Poetry:
-```bash
-poetry install --with search
-```
+## 2. With Documentation Dependencies
 
-#### Option 3: With UI Support (Requires GPT-NeoX)
+If you plan to build or serve the documentation locally:
 
-For the interactive Streamlit web interface:
-
-```bash
-git clone https://github.com/aflah02/tokensmith.git
-cd tokensmith
-pip install -e ".[ui]"
-```
-
-Or with Poetry:
 ```bash
-poetry install --with ui
+pip install "tokensmith[docs]"
 ```
 
-#### Option 4: With Documentation Tools
-
-For building documentation:
+Once installed, you can build and serve the docs:
 
 ```bash
-pip install -e ".[docs]"
-```
-
-Or with Poetry:
-```bash
-poetry install --with docs
+mkdocs serve
 ```
 
-#### Option 5: Complete Installation (Requires GPT-NeoX)
+## 3. With UI Components
 
-For all optional dependencies (search, UI, and docs):
+If you want the **interactive interface** for exploring data:
 
 ```bash
-pip install -e ".[all]"
+pip install "tokensmith[ui]"
 ```
 
-Or with Poetry:
-```bash
-poetry install --with all
-```
+## 4. With Search Features
+
+For advanced **token-level search and n-gram utilities**:
 
-You can also combine multiple options:
 ```bash
-pip install -e ".[search,ui]"  # Search + UI
+pip install "tokensmith[search]"
 ```
 
-#### GPT-NeoX/Megatron Integration
-
-**Note:** For functionality that requires GPT-NeoX/Megatron (such as `WriteableMMapIndexedDataset` and some advanced dataset operations), you must separately install GPT-NeoX following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies).
-
-TokenSmith is designed to work with or without GPT-NeoX:
-- **Without GPT-NeoX**: Only search functionality works standalone
-- **With GPT-NeoX**: Full functionality including UI, dataset operations, editing, sampling, and advanced operations
-
-**Note:** `torch` and `transformers` are provided by the GPT-NeoX environment and are not included as TokenSmith dependencies to avoid version conflicts.
-
-#### Python Version Requirements
-
-- **Python 3.8+** is required
-- Compatible with modern Python versions and dependency ecosystems
-
-#### Which Installation Option to Choose?
-
-- **Search only**: Use Option 2 if you only need token sequence search and indexing (works standalone)
-- **Full functionality**: Use Options 3-5 if you need UI or dataset operations (requires GPT-NeoX environment)
-  - **Web interface**: Use Option 3 for interactive Streamlit UI
-  - **Documentation**: Use Option 4 for contributing to docs
-  - **Complete features**: Use Option 5 for all functionality
-- **Development**: Use Option 5 for developing TokenSmith
-
-**Important**: Only search functionality works without GPT-NeoX. All other features require the GPT-NeoX environment.
+## 5. Full Installation (Everything)
 
-### Basic Usage
+To install **all optional features**:
 
-#### Search Functionality (Works standalone - no GPT-NeoX required)
-
-```python
-from tokensmith import DatasetManager
-
-# Initialize the manager
-manager = DatasetManager()
-
-# Setup search functionality - requires tokengrams but no GPT-NeoX
-try:
-    manager.setup_search(
-        bin_file_path="path/to/dataset.bin",
-        search_index_save_path="path/to/search_index",
-        vocab=2**16,  # or 2**32 for larger vocabularies
-        reuse=True
-    )
-    
-    # Search operations
-    query = [101, 2023, 102]  # Token IDs
-    count = manager.search.count(query)
-    positions = manager.search.positions(query)
-    print("✅ Search functionality available")
-    
-except ImportError as e:
-    print("ℹ️ Search functionality requires tokengrams: pip install 'tokensmith[search]'")
+```bash
+pip install "tokensmith[all]"
 ```
 
-#### Dataset Operations (Requires GPT-NeoX environment)
+This includes docs, UI, and search extras.
 
-```python
-from tokensmith import DatasetManager
-
-# Initialize the manager
-manager = DatasetManager()
-
-# Note: All operations below require GPT-NeoX environment to be installed
-try:
-    # Setup dataset for inspection, sampling, editing, and export
-    manager.setup_edit_inspect_sample_export(
-        dataset_prefix="path/to/your/dataset",
-        batch_info_save_prefix="path/to/batch_info",
-        train_iters=1000,
-        train_batch_size=32,
-        train_seq_len=1024,
-        seed=42
-    )
-    
-    # Load a tokenizer (requires transformers from GPT-NeoX environment)
-    from transformers import AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    
-    print("✅ Full functionality available")
-except ImportError as e:
-    print("ℹ️ Dataset operations require GPT-NeoX installation")
-    print("   See: https://github.com/EleutherAI/gpt-neox")
-```
+## 6. Development Installation
 
-#### Web UI Usage (Requires GPT-NeoX environment)
+If you’re contributing to `tokensmith`:
 
 ```bash
-# Note: UI requires GPT-NeoX environment
-# Navigate to UI directory and run
-cd tokensmith/ui
-./run_ui.sh
-
-# Or modify run_ui.sh for your specific setup
+git clone https://github.com/aflah02/tokensmith.git
+cd tokensmith
+pip install -e ".[all,docs,ui,search]"
 ```
 
+This sets up a local environment with all extras for development.
+
 ## 📚 Core Functionality
 
 ### 🔍 Search Operations

From 8a9f5caa7579f6bfecdcf5a2a69429300af96f0a Mon Sep 17 00:00:00 2001
From: Aflah <72096386+aflah02@users.noreply.github.com>
Date: Fri, 26 Sep 2025 13:10:37 +0200
Subject: [PATCH 8/9] Update author details and repository URL in setup.py

---
 setup.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 4ddeb82..493504d 100644
--- a/setup.py
+++ b/setup.py
@@ -3,16 +3,14 @@
 setup(
     name='tokensmith',
     version='0.1.0',
-    author='Your Name',
-    author_email='your.email@example.com',
     description='A package for managing datasets with editing, inspecting, sampling, exporting, and searching functionalities.',
     long_description=open('README.md').read(),
     long_description_content_type='text/markdown',
-    url='https://github.com/yourusername/tokensmith',  # Replace with your actual repository URL
+    url='https://github.com/aflah02/TokenSmith', 
     packages=find_packages(),
     classifiers=[
         'Programming Language :: Python :: 3',
-        'License :: OSI Approved :: MIT License',  # Replace with your actual license
+        'License :: Apache 2.0 License', 
         'Operating System :: OS Independent',
     ],
     python_requires='>=3.8',
@@ -41,4 +39,4 @@
             'tokengrams>=0.3.0',
         ],
     },
-)
\ No newline at end of file
+)

From 532ff806abe2b288e782ac78edecb99bb9959400 Mon Sep 17 00:00:00 2001
From: Aflah <72096386+aflah02@users.noreply.github.com>
Date: Fri, 26 Sep 2025 13:11:28 +0200
Subject: [PATCH 9/9] Correct capitalization of 'TokenSmith' in README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8e17ccc..b0c2490 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ DatasetManager
 
 ### Installation
 
-`tokensmith` can be installed in several ways depending on your use case.
+`TokenSmith` can be installed in several ways depending on your use case.
 
 Note: Apart from search all features assume that GPT-NeoX is installed to use Megatron. You can do that by simply following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies).