diff --git a/README.md b/README.md index d946694..b0c2490 100644 --- a/README.md +++ b/README.md @@ -38,47 +38,70 @@ DatasetManager ### Installation -Ensure you have a working GPT-NeoX environment using steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies) +`TokenSmith` can be installed in several ways depending on your use case. -Within the same env run the following - +Note: Apart from search, all features assume that GPT-NeoX is installed, since they rely on Megatron. You can install it by following the steps provided [here](https://github.com/EleutherAI/gpt-neox?tab=readme-ov-file#environment-and-dependencies). + +## 1. Basic Installation (Core Only) + +If you only need the **core functionality** (data editing, sampling, importing, exporting, inspection): ```bash -git clone https://github.com/aflah02/tokensmith.git -cd tokensmith -pip install -e . +pip install tokensmith ``` -### Basic Usage +## 2. With Documentation Dependencies -```python -from tokensmith import DatasetManager -from transformers import AutoTokenizer - -# Initialize the manager -manager = DatasetManager() - -# Setup dataset for inspection, sampling, editing, and export -manager.setup_edit_inspect_sample_export( - dataset_prefix="path/to/your/dataset", - batch_info_save_prefix="path/to/batch_info", - train_iters=1000, - train_batch_size=32, - train_seq_len=1024, - seed=42 -) +If you plan to build or serve the documentation locally: -# Setup search functionality (optional) -manager.setup_search( - bin_file_path="path/to/dataset.bin", - search_index_save_path="path/to/search_index", - vocab=2**16, # or 2**32 for larger vocabularies - reuse=True -) +```bash +pip install "tokensmith[docs]" +``` + +Once installed, you can build and serve the docs: + +```bash +mkdocs serve +``` + +## 3. With UI Components + +If you want the **interactive interface** for exploring data: + +```bash +pip install "tokensmith[ui]" +``` + +## 4. 
With Search Features + +For advanced **token-level search and n-gram utilities**: + +```bash +pip install "tokensmith[search]" +``` + +## 5. Full Installation (Everything) + +To install **all optional features**: + +```bash +pip install "tokensmith[all]" +``` + +This includes the UI and search extras; install `tokensmith[docs]` separately if you also need the documentation tooling. + +## 6. Development Installation + +If you’re contributing to `tokensmith`: -# Load a tokenizer for detokenization -tokenizer = AutoTokenizer.from_pretrained("gpt2") +```bash +git clone https://github.com/aflah02/tokensmith.git +cd tokensmith +pip install -e ".[all,docs,ui,search]" ``` +This sets up a local environment with all extras for development. + ## 📚 Core Functionality ### 🔍 Search Operations diff --git a/pyproject.toml b/pyproject.toml index 7793da6..07202b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,8 +10,11 @@ repository = "https://github.com/aflah02/tokensmith" keywords = ["dataset", "management", "editing", "sampling", "exporting", "searching"] [tool.poetry.dependencies] -python = "^3.7" -# Add other dependencies here +python = "^3.8" +# Core dependencies +numpy = "^1.21.0" +pandas = "^1.3.0" +tqdm = "^4.62.0" [tool.poetry.group.docs.dependencies] mkdocs = "^1.5.0" @@ -19,6 +22,19 @@ mkdocs-material = "^9.0.0" mkdocstrings = {extras = ["python"], version = "^0.24.0"} mkdocstrings-python = "^1.7.0" +[tool.poetry.group.ui.dependencies] +streamlit = "^1.20.0" +altair = "^4.2.0" + +[tool.poetry.group.search.dependencies] +tokengrams = "^0.3.0" + +[tool.poetry.group.all.dependencies] +# All optional dependencies for complete functionality +streamlit = "^1.20.0" +altair = "^4.2.0" +tokengrams = "^0.3.0" + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" \ No newline at end of file diff --git a/setup.py b/setup.py index 59376e4..493504d 100644 --- a/setup.py +++ b/setup.py @@ -3,20 +3,40 @@ setup( name='tokensmith', version='0.1.0', - author='Your Name', - author_email='your.email@example.com', 
description='A package for managing datasets with editing, inspecting, sampling, exporting, and searching functionalities.', long_description=open('README.md').read(), long_description_content_type='text/markdown', - url='https://github.com/yourusername/tokensmith', # Replace with your actual repository URL + url='https://github.com/aflah02/TokenSmith', packages=find_packages(), classifiers=[ 'Programming Language :: Python :: 3', - 'License :: OSI Approved :: MIT License', # Replace with your actual license + 'License :: Apache 2.0 License', 'Operating System :: OS Independent', ], - python_requires='>=3.6', + python_requires='>=3.8', install_requires=[ - # List your package dependencies here + 'numpy>=1.21.0', + 'pandas>=1.3.0', + 'tqdm>=4.62.0', ], -) \ No newline at end of file + extras_require={ + 'ui': [ + 'streamlit>=1.20.0', + 'altair>=4.2.0', + ], + 'search': [ + 'tokengrams>=0.3.0', + ], + 'docs': [ + 'mkdocs>=1.5.0', + 'mkdocs-material>=9.0.0', + 'mkdocstrings[python]>=0.24.0', + 'mkdocstrings-python>=1.7.0', + ], + 'all': [ + 'streamlit>=1.20.0', + 'altair>=4.2.0', + 'tokengrams>=0.3.0', + ], + }, +) diff --git a/tokensmith/search/handler.py b/tokensmith/search/handler.py index bc14e20..c637dc8 100644 --- a/tokensmith/search/handler.py +++ b/tokensmith/search/handler.py @@ -1,12 +1,21 @@ # Heavily inspired by the original code from https://github.com/EleutherAI/tokengrams/blob/master/tokengrams/tokengrams.pyi and uses the same library. 
-from tokengrams import MemmapIndex from typing import List import os import logging +# Optional tokengrams import - will be imported when needed +try: + from tokengrams import MemmapIndex + TOKENGRAMS_AVAILABLE = True +except ImportError: + MemmapIndex = None + TOKENGRAMS_AVAILABLE = False + class SearchHandler: def __init__(self, bin_file_path: str, index_save_path: str, vocab: int, verbose: bool = True, reuse: bool = True): + if not TOKENGRAMS_AVAILABLE: + raise ImportError("Tokengrams is required for search functionality. Please install with: pip install 'tokensmith[search]' or pip install tokengrams") self.bin_file_path = bin_file_path self.index_save_path = index_save_path diff --git a/tokensmith/ui/app.py b/tokensmith/ui/app.py index 71e0193..3721b89 100644 --- a/tokensmith/ui/app.py +++ b/tokensmith/ui/app.py @@ -62,6 +62,8 @@ class DefaultArgs: from transformers import AutoTokenizer print(f"Loading tokenizer from {st.session_state.args.tokenizer_path}") st.session_state.tokenizer = AutoTokenizer.from_pretrained(st.session_state.args.tokenizer_path) + except ImportError: + st.error("Transformers library not available. 
Tokenizer functionality requires transformers to be installed (should be available in GPT-NeoX environment).") except Exception as e: st.error(f"Failed to load tokenizer: {e}") diff --git a/tokensmith/utils.py b/tokensmith/utils.py index 6ac807c..59e7729 100644 --- a/tokensmith/utils.py +++ b/tokensmith/utils.py @@ -6,14 +6,28 @@ from tqdm import trange import uuid from typing import Optional, List, Dict, Any -from megatron.data.indexed_dataset import MMapIndexedDataset -from .megatron_dependencies import get_train_valid_test_split_, build_index_mappings -from transformers import AutoTokenizer -import torch import os from functools import lru_cache import time +# Optional transformers imports - will be imported when needed +try: + from transformers import AutoTokenizer + TRANSFORMERS_AVAILABLE = True +except ImportError: + AutoTokenizer = None + TRANSFORMERS_AVAILABLE = False + +# Optional megatron imports - will be imported when needed +try: + from megatron.data.indexed_dataset import MMapIndexedDataset + MEGATRON_AVAILABLE = True +except ImportError: + MMapIndexedDataset = None + MEGATRON_AVAILABLE = False + +from .megatron_dependencies import get_train_valid_test_split_, build_index_mappings + logger = logging.getLogger(__name__) @lru_cache(1) @@ -21,7 +35,22 @@ def warn_once(logger: logging.Logger, msg: str): logger.warning(msg) time.sleep(10) -def generate_training_sample(tokenized_segments: List[List[int]], tokenizer: AutoTokenizer) -> str: +def generate_training_sample(tokenized_segments: List[List[int]], tokenizer) -> str: + """Generate training sample from tokenized segments using a tokenizer. 
+ + Args: + tokenized_segments: List of tokenized segments + tokenizer: Tokenizer object (should have a decode method) + + Returns: + Decoded text string + + Raises: + ImportError: If transformers is not available and tokenizer is None + """ + if not TRANSFORMERS_AVAILABLE and tokenizer is None: + raise ImportError("Transformers is required for tokenization functionality. It should be available from your GPT-NeoX environment.") + concat_training_sample = np.concatenate(tokenized_segments) return tokenizer.decode( concat_training_sample, @@ -61,6 +90,9 @@ def __init__(self, packing_impl: str, allow_chopped: bool, add_extra_token_to_seq: int): + if not MEGATRON_AVAILABLE: + raise ImportError("Megatron is required for WriteableMMapIndexedDataset functionality. Please install GPT-NeoX following the instructions in the README.") + logger.debug(f"Initializing WriteableMMapIndexedDataset with pointer: {dataset_prefix}.bin and index: {dataset_prefix}.idx") self.corpus_pointer = open(f"{dataset_prefix}.bin", 'r+b')