Skip to content

Commit 8af2663

Browse files
committed
import
0 parents  commit 8af2663

20 files changed

Lines changed: 1904 additions & 0 deletions
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# Workflow based on:
2+
# https://packaging.python.org/en/latest/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
3+
name: Publish Python distribution
4+
5+
on: push
6+
7+
jobs:
8+
# Build job: checks out the repository, builds the wheel and sdist with
# pypa/build, and stores the resulting dist/ directory as a workflow artifact.
build:
  name: Build distribution
  runs-on: ubuntu-latest

  steps:
    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        # Quoted so the version stays a string rather than a float.
        python-version: "3.9"
    - name: Install pypa/build
      run: python3 -m pip install build --user
    - name: Build a binary wheel and a source tarball
      run: python3 -m build
    # The artifact name must match the one the publish jobs download.
    - name: Store the distribution packages
      uses: actions/upload-artifact@v4
      with:
        name: python-package-distributions
        path: dist/
31+
32+
# Publishes the distributions produced by the `build` job to PyPI.
# The `if` condition restricts this job to tag pushes.
publish-to-pypi:
  name: Publish Python distribution to PyPI
  if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
  needs:
    - build
  runs-on: ubuntu-latest
  environment:
    name: pypi
    # Project page for this package (bsparse), not the project the workflow
    # template was copied from.
    url: https://pypi.org/p/bsparse
  permissions:
    id-token: write  # IMPORTANT: mandatory for trusted publishing

  steps:
    # Fetch the artifact uploaded by the `build` job into dist/.
    - name: Download all the dists
      uses: actions/download-artifact@v4
      with:
        name: python-package-distributions
        path: dist/
    - name: Publish distribution to PyPI
      uses: pypa/gh-action-pypi-publish@release/v1
53+
54+
# Signs the built distributions with Sigstore, creates a GitHub Release for
# the pushed ref, and uploads everything in dist/ to that release.
# Runs only after `publish-to-pypi` has completed.
github-release:
  name: >-
    Sign the Python distribution with Sigstore
    and upload them to GitHub Release
  needs:
    - publish-to-pypi
  runs-on: ubuntu-latest

  permissions:
    contents: write  # IMPORTANT: mandatory for making GitHub Releases
    id-token: write  # IMPORTANT: mandatory for sigstore

  steps:
    - name: Download all the dists
      uses: actions/download-artifact@v4
      with:
        name: python-package-distributions
        path: dist/
    # NOTE(review): confirm this pinned action version still runs on current
    # GitHub-hosted runners before relying on it.
    - name: Sign the dists with Sigstore
      uses: sigstore/gh-action-sigstore-python@v2.1.1
      with:
        inputs: >-
          ./dist/*.tar.gz
          ./dist/*.whl
    # The ref name and repository are read from the runner-provided
    # environment variables instead of being interpolated into the script
    # with `${{ }}`; interpolation would let a crafted tag name inject shell
    # code into the command line.
    - name: Create GitHub Release
      env:
        GITHUB_TOKEN: ${{ github.token }}
      run: >-
        gh release create
        "$GITHUB_REF_NAME"
        --repo "$GITHUB_REPOSITORY"
        --notes ""
    - name: Upload artifact signatures to GitHub Release
      env:
        GITHUB_TOKEN: ${{ github.token }}
      # Upload to GitHub Release using the `gh` CLI.
      # `dist/` contains the built packages, and the
      # sigstore-produced signatures and certificates.
      run: >-
        gh release upload
        "$GITHUB_REF_NAME" dist/**
        --repo "$GITHUB_REPOSITORY"
96+
97+
# Publishes the distributions produced by the `build` job to TestPyPI.
publish-to-testpypi:
  name: Publish Python distribution to TestPyPI
  # The condition below is commented out, so this job runs on every push.
  # Uncomment it to disable publishing to TestPyPI.
  # if: startsWith(github.ref, 'refs/PUBLISH-DISABLED/')
  needs:
    - build
  runs-on: ubuntu-latest

  environment:
    name: testpypi
    # Project page for this package (bsparse), not the project the workflow
    # template was copied from.
    url: https://test.pypi.org/p/bsparse

  permissions:
    id-token: write  # IMPORTANT: mandatory for trusted publishing

  steps:
    # Fetch the artifact uploaded by the `build` job into dist/.
    - name: Download all the dists
      uses: actions/download-artifact@v4
      with:
        name: python-package-distributions
        path: dist/
    - name: Publish distribution to TestPyPI
      uses: pypa/gh-action-pypi-publish@release/v1
      with:
        # Upload endpoint for TestPyPI instead of the default PyPI.
        repository-url: https://test.pypi.org/legacy/

.gitignore

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
pip-wheel-metadata/
24+
share/python-wheels/
25+
*.egg-info/
26+
.installed.cfg
27+
*.egg
28+
MANIFEST
29+
30+
# PyInstaller
31+
# Usually these files are written by a python script from a template
32+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
33+
*.manifest
34+
*.spec
35+
36+
# Installer logs
37+
pip-log.txt
38+
pip-delete-this-directory.txt
39+
40+
# Unit test / coverage reports
41+
htmlcov/
42+
.tox/
43+
.nox/
44+
.coverage
45+
.coverage.*
46+
.cache
47+
nosetests.xml
48+
coverage.xml
49+
*.cover
50+
*.py,cover
51+
.hypothesis/
52+
.pytest_cache/
53+
cover/
54+
55+
# Translations
56+
*.mo
57+
*.pot
58+
59+
# Django stuff:
60+
*.log
61+
local_settings.py
62+
db.sqlite3
63+
db.sqlite3-journal
64+
65+
# Flask stuff:
66+
instance/
67+
.webassets-cache
68+
69+
# Scrapy stuff:
70+
.scrapy
71+
72+
# Sphinx documentation
73+
docs/_build/
74+
75+
# PyBuilder
76+
.pybuilder/
77+
target/
78+
79+
# Jupyter Notebook
80+
.ipynb_checkpoints
81+
82+
# IPython
83+
profile_default/
84+
ipython_config.py
85+
86+
# pyenv
87+
.python-version
88+
89+
# pipenv
90+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
92+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
93+
# install all needed dependencies.
94+
#Pipfile.lock
95+
96+
# poetry
97+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
98+
# This is especially recommended for binary packages to ensure reproducibility, and is more
99+
# commonly ignored for libraries.
100+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
101+
#poetry.lock
102+
103+
# pdm
104+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
105+
#pdm.lock
106+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
107+
# in version control.
108+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
109+
.pdm.toml
110+
.pdm-python
111+
.pdm-build/
112+
113+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114+
__pypackages__/
115+
116+
# Celery stuff
117+
celerybeat-schedule
118+
celerybeat.pid
119+
120+
# SageMath parsed files
121+
*.sage.py
122+
123+
# Environments
124+
.env
125+
.venv
126+
env/
127+
venv/
128+
ENV/
129+
env.bak/
130+
venv.bak/
131+
132+
# Spyder project settings
133+
.spyderproject
134+
.spyproject
135+
136+
# Rope project settings
137+
.ropeproject
138+
139+
# mkdocs documentation
140+
/site
141+
142+
# mypy
143+
.mypy_cache/
144+
.dmypy.json
145+
dmypy.json
146+
147+
# Pyre type checker
148+
.pyre/
149+
150+
# pytype static type analyzer
151+
.pytype/
152+
153+
# Cython debug symbols
154+
cython_debug/
155+
156+
# PyCharm
157+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159+
# and can be added to the global gitignore or merged into this file. For a more nuclear
160+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
161+
#.idea/
162+
.aider*

LICENSE

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2024-2025 Andrew Yates
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

README.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
[![PyPI version fury.io](https://badge.fury.io/py/bsparse.svg)](https://pypi.python.org/pypi/bsparse/)
2+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
3+
[![Workflow](https://github.com/andrewyates/bsparse/workflows/pytest/badge.svg)](https://github.com/andrewyates/bsparse/actions)
4+
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
5+
# bsparse
6+
bsparse is a toolkit for creating and searching learned sparse representations
7+
8+
## Usage examples
9+
```
10+
# Recommended way to install requirements:
11+
# (using pip only works too, but uv is much faster)
12+
pipx install uv
13+
# Create virtual environment
14+
uv venv venv
15+
# Activate
16+
source venv/bin/activate
17+
# Install requirements
18+
uv pip install -r requirements.txt
19+
```
20+
21+
```
22+
# Request access to splade-v3: https://huggingface.co/naver/splade-v3
23+
# Get your huggingface API token and then:
24+
export HF_TOKEN="the token"
25+
26+
# load Python virtual environment
27+
source venv/bin/activate
28+
29+
# optional: spot check output from a model
30+
python -m bsparse.cli check --text "tesla net worth"
31+
32+
# create query representations:
33+
python -m bsparse.cli encode --out nfcorpus-queries.jsonl \
34+
--dataset irds --type query --name beir/nfcorpus --batch-size 64
35+
36+
# create doc representations:
37+
python -m bsparse.cli encode --out nfcorpus-docs.jsonl \
38+
--dataset irds --type doc --name beir/nfcorpus --batch-size 64
39+
40+
# search and evaluate without building an index:
41+
python -m bsparse.cli memsearch --out nfcorpus.run --docs nfcorpus-docs.jsonl --queries nfcorpus-queries.jsonl --qrels beir/nfcorpus/test
42+
43+
44+
# alternatively, you can build an index and search it
45+
46+
# 1) setup: compile ScaledJsonVectorCollection.java and add it to anserini-1.0.0-fatjar.jar
47+
$ wget -c https://repo1.maven.org/maven2/io/anserini/anserini/1.0.0/anserini-1.0.0-fatjar.jar
48+
$ cd java
49+
$ javac -cp ../anserini-1.0.0-fatjar.jar io/anserini/collection/*.java
50+
$ cp ../anserini-1.0.0-fatjar.jar ../anserini-1.0.0-fatjar-bsparse.jar
51+
$ jar uf ../anserini-1.0.0-fatjar-bsparse.jar io/anserini/collection/*.class
52+
53+
# 2) build index
54+
java -cp anserini-1.0.0-fatjar-bsparse.jar io.anserini.index.IndexCollection \
55+
-generator DefaultLuceneDocumentGenerator -impact -pretokenized \
56+
-threads 16 -collection ScaledJsonVectorCollection \
57+
-input /path/to/encoded-text -index /path/to/encoded-text-index
58+
59+
# 3) search index
60+
# Create sparse query representations in `$QUERY_VECTORS` and create an index in `$INDEX`, then:
61+
python -m bsparse.cli search --index $INDEX --queries $QUERY_VECTORS --out test.run --topk 1000
62+
```

bsparse/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from .convert import (
2+
dict2jsonl,
3+
dict2vec,
4+
jsonl2dict,
5+
load_dict,
6+
save_dict,
7+
vec2dict,
8+
)
9+
from .jsonl import SparseRepresentations
10+
from .utils import batch_encode, get_torch_device, token_ids_to_binary_vec
11+
12+
13+
__version__ = "0.1.0"

0 commit comments

Comments
 (0)