aflah02 · aflah02 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
diff --git a/README.md b/README.md
@@ -38,47 +38,70 @@ DatasetManager
 
 ### Installation
 
-Ensure you have a working GPT-NeoX environment using steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies)
+`TokenSmith` can be installed in several ways depending on your use case.
 
-Within the same env run the following - 
+Note: All features except search require a working GPT-NeoX installation, because they depend on Megatron. To set it up, follow the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies).
+
+## 1. Basic Installation (Core Only)
+
+If you only need the **core functionality** (data editing, sampling, importing, exporting, inspection):
 
 ```bash
-git clone https://github.com/aflah02/tokensmith.git
-cd tokensmith
-pip install -e .
+pip install tokensmith
 ```
 
-### Basic Usage
+## 2. With Documentation Dependencies
 
-```python
-from tokensmith import DatasetManager
-from transformers import AutoTokenizer
-
-# Initialize the manager
-manager = DatasetManager()
-
-# Setup dataset for inspection, sampling, editing, and export
-manager.setup_edit_inspect_sample_export(
-    dataset_prefix="path/to/your/dataset",
-    batch_info_save_prefix="path/to/batch_info",
-    train_iters=1000,
-    train_batch_size=32,
-    train_seq_len=1024,
-    seed=42
-)
+If you plan to build or serve the documentation locally:
 
-# Setup search functionality (optional)
-manager.setup_search(
-    bin_file_path="path/to/dataset.bin",
-    search_index_save_path="path/to/search_index",
-    vocab=2**16,  # or 2**32 for larger vocabularies
-    reuse=True
-)
+```bash
+pip install "tokensmith[docs]"
+```
+
+Once installed, you can build and serve the docs:
+
+```bash
+mkdocs serve
+```
+
+## 3. With UI Components
+
+If you want the **interactive interface** for exploring data:
+
+```bash
+pip install "tokensmith[ui]"
+```
+
+## 4. With Search Features
+
+For advanced **token-level search and n-gram utilities**:
+
+```bash
+pip install "tokensmith[search]"
+```
+
+## 5. Full Installation (Everything)
+
+To install **all optional features**:
+
+```bash
+pip install "tokensmith[all]"
+```
+
+This includes the UI and search extras. The documentation tooling is not part of `all`; install `tokensmith[docs]` as well if you need it.
+
+## 6. Development Installation
+
+If you’re contributing to `tokensmith`:
 
-# Load a tokenizer for detokenization
-tokenizer = AutoTokenizer.from_pretrained("gpt2")
+```bash
+git clone https://github.com/aflah02/tokensmith.git
+cd tokensmith
+pip install -e ".[all,docs,ui,search]"
 ```
 
+This sets up a local environment with all extras for development.
+
 ## 📚 Core Functionality
 
 ### 🔍 Search Operations

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,15 +10,31 @@ repository = "https://github.com/aflah02/tokensmith"
 keywords = ["dataset", "management", "editing", "sampling", "exporting", "searching"]
 
 [tool.poetry.dependencies]
-python = "^3.7"
-# Add other dependencies here
+python = "^3.8"
+# Core dependencies
+numpy = "^1.21.0"
+pandas = "^1.3.0"
+tqdm = "^4.62.0"
 
 [tool.poetry.group.docs.dependencies]
 mkdocs = "^1.5.0"
 mkdocs-material = "^9.0.0"
 mkdocstrings = {extras = ["python"], version = "^0.24.0"}
 mkdocstrings-python = "^1.7.0"
 
+[tool.poetry.group.ui.dependencies]
+streamlit = "^1.20.0"
+altair = "^4.2.0"
+
+[tool.poetry.group.search.dependencies]
+tokengrams = "^0.3.0"
+
+[tool.poetry.group.all.dependencies]
+# All optional dependencies for complete functionality
+streamlit = "^1.20.0"
+altair = "^4.2.0"
+tokengrams = "^0.3.0"
+
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
diff --git a/setup.py b/setup.py
@@ -3,20 +3,40 @@
 setup(
     name='tokensmith',
     version='0.1.0',
-    author='Your Name',
-    author_email='your.email@example.com',
     description='A package for managing datasets with editing, inspecting, sampling, exporting, and searching functionalities.',
     long_description=open('README.md').read(),
     long_description_content_type='text/markdown',
-    url='https://github.com/yourusername/tokensmith',  # Replace with your actual repository URL
+    url='https://github.com/aflah02/TokenSmith', 
     packages=find_packages(),
     classifiers=[
         'Programming Language :: Python :: 3',
-        'License :: OSI Approved :: MIT License',  # Replace with your actual license
+        'License :: OSI Approved :: Apache Software License',  # Official trove classifier for Apache-2.0; PyPI rejects unknown classifiers
         'Operating System :: OS Independent',
     ],
-    python_requires='>=3.6',
+    python_requires='>=3.8',
     install_requires=[
-        # List your package dependencies here
+        'numpy>=1.21.0',
+        'pandas>=1.3.0', 
+        'tqdm>=4.62.0',
     ],
-)
+    extras_require={
+        'ui': [
+            'streamlit>=1.20.0',
+            'altair>=4.2.0',
+        ],
+        'search': [
+            'tokengrams>=0.3.0',
+        ],
+        'docs': [
+            'mkdocs>=1.5.0',
+            'mkdocs-material>=9.0.0', 
+            'mkdocstrings[python]>=0.24.0',
+            'mkdocstrings-python>=1.7.0',
+        ],
+        'all': [
+            'streamlit>=1.20.0',
+            'altair>=4.2.0',
+            'tokengrams>=0.3.0',
+        ],
+    },
+)
diff --git a/tokensmith/search/handler.py b/tokensmith/search/handler.py
@@ -1,12 +1,21 @@
 # Heavily inspired by the original code from https://github.com/EleutherAI/tokengrams/blob/master/tokengrams/tokengrams.pyi and uses the same library.
 
-from tokengrams import MemmapIndex
 from typing import List
 import os
 import logging
 
+# Optional dependency: tokengrams is imported here if installed; otherwise SearchHandler raises ImportError on construction
+try:
+    from tokengrams import MemmapIndex
+    TOKENGRAMS_AVAILABLE = True
+except ImportError:
+    MemmapIndex = None
+    TOKENGRAMS_AVAILABLE = False
+
 class SearchHandler:
     def __init__(self, bin_file_path: str, index_save_path: str, vocab: int, verbose: bool = True, reuse: bool = True):
+        if not TOKENGRAMS_AVAILABLE:
+            raise ImportError("Tokengrams is required for search functionality. Please install with: pip install 'tokensmith[search]' or pip install tokengrams")
 
         self.bin_file_path = bin_file_path
         self.index_save_path = index_save_path

diff --git a/tokensmith/ui/app.py b/tokensmith/ui/app.py
@@ -62,6 +62,8 @@ class DefaultArgs:
                 from transformers import AutoTokenizer
                 print(f"Loading tokenizer from {st.session_state.args.tokenizer_path}")
                 st.session_state.tokenizer = AutoTokenizer.from_pretrained(st.session_state.args.tokenizer_path)
+            except ImportError:
+                st.error("Transformers library not available. Tokenizer functionality requires transformers to be installed (should be available in GPT-NeoX environment).")
             except Exception as e:
                 st.error(f"Failed to load tokenizer: {e}")
 

diff --git a/tokensmith/utils.py b/tokensmith/utils.py
@@ -6,22 +6,51 @@
 from tqdm import trange
 import uuid
 from typing import Optional, List, Dict, Any
-from megatron.data.indexed_dataset import MMapIndexedDataset
-from .megatron_dependencies import get_train_valid_test_split_, build_index_mappings
-from transformers import AutoTokenizer
-import torch
 import os
 from functools import lru_cache
 import time
 
+# Optional dependency: import AutoTokenizer if transformers is installed; TRANSFORMERS_AVAILABLE records whether it was
+try:
+    from transformers import AutoTokenizer
+    TRANSFORMERS_AVAILABLE = True
+except ImportError:
+    AutoTokenizer = None
+    TRANSFORMERS_AVAILABLE = False
+
+# Optional dependency: import MMapIndexedDataset if megatron is installed; MEGATRON_AVAILABLE records whether it was
+try:
+    from megatron.data.indexed_dataset import MMapIndexedDataset
+    MEGATRON_AVAILABLE = True
+except ImportError:
+    MMapIndexedDataset = None
+    MEGATRON_AVAILABLE = False
+
+from .megatron_dependencies import get_train_valid_test_split_, build_index_mappings
+
 logger = logging.getLogger(__name__)
 
 @lru_cache(1)
 def warn_once(logger: logging.Logger, msg: str):
     logger.warning(msg)
     time.sleep(10)
 
-def generate_training_sample(tokenized_segments: List[List[int]], tokenizer: AutoTokenizer) -> str:
+def generate_training_sample(tokenized_segments: List[List[int]], tokenizer) -> str:
+    """Generate training sample from tokenized segments using a tokenizer.
+
+    Args:
+        tokenized_segments: List of tokenized segments
+        tokenizer: Tokenizer object (should have a decode method)
+
+    Returns:
+        Decoded text string
+
+    Raises:
+        ImportError: If transformers is not available and tokenizer is None
+    """
+    if not TRANSFORMERS_AVAILABLE and tokenizer is None:
+        raise ImportError("Transformers is required for tokenization functionality. It should be available from your GPT-NeoX environment.")
+
     concat_training_sample = np.concatenate(tokenized_segments)
     return tokenizer.decode(
         concat_training_sample,
@@ -61,6 +90,9 @@ def __init__(self,
                  packing_impl: str,
                  allow_chopped: bool,
                  add_extra_token_to_seq: int):
+        if not MEGATRON_AVAILABLE:
+            raise ImportError("Megatron is required for WriteableMMapIndexedDataset functionality. Please install GPT-NeoX following the instructions in the README.")
+
         logger.debug(f"Initializing WriteableMMapIndexedDataset with pointer: {dataset_prefix}.bin and index: {dataset_prefix}.idx")
 
         self.corpus_pointer = open(f"{dataset_prefix}.bin", 'r+b')