Skip to content

Commit 06e5595

Browse files
committed
split up functionality
1 parent 93239c8 commit 06e5595

3 files changed

Lines changed: 140 additions & 24 deletions

File tree

datasets.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import os
2+
from urllib.request import urlretrieve
3+
from pathlib import Path
4+
5+
def download(src, dst):
    """Fetch *src* (a URL) to local path *dst*, creating parent directories.

    Skips the download entirely when *dst* already exists, so repeated
    calls are cheap no-ops.
    """
    target = Path(dst)
    if target.exists():
        return
    target.parent.mkdir(parents=True, exist_ok=True)
    print('downloading %s -> %s...' % (src, dst))
    urlretrieve(src, dst)
10+
11+
def get_fn(kind):
    """Return (data_path, groundtruth_path) for *kind* under data/<kind>/.

    The dataset version is currently fixed to 'ccnews-small'; the
    ground-truth file lives in a 'gt' subdirectory next to the data file.
    """
    version = "ccnews-small"
    base = os.path.join("data", kind)
    return os.path.join(base, f"{version}.h5"), os.path.join(base, 'gt', f'{version}.h5')
14+
15+
def prepare(kind, dataset='ccnews-small'):
    """Download the data and ground-truth files for a task, if missing.

    Args:
        kind: task name ('task1' or 'task2'), a key under DATASETS[dataset].
        dataset: dataset name to look up in DATASETS. Defaults to
            'ccnews-small', preserving the previous hard-coded behavior.
    """
    # Look the entry up once instead of indexing DATASETS twice.
    entry = DATASETS[dataset][kind]
    fn, gt_fn = get_fn(kind)

    # download() is a no-op when the target file already exists.
    download(entry['url'], fn)
    download(entry['gt_url'], gt_fn)
22+
23+
# Registry of benchmark datasets: dataset name -> task kind -> spec.
# Each spec provides:
#   'url'     -- download URL for the HDF5 data file
#   'queries' -- accessor: open h5py.File -> query vectors
#   'data'    -- accessor: open h5py.File -> database vectors
#   'gt_url'  -- download URL for the ground-truth HDF5 file
#   'gt_I'    -- accessor: open h5py.File -> ground-truth knn ids
DATASETS = {
    'ccnews-small': {
        # task1: held-out 'itest' queries searched against the train set;
        # ground truth lives in the same file under itest/knns.
        'task1': {
            'url': 'https://huggingface.co/datasets/sadit/SISAP2025/resolve/main/benchmark-dev-ccnews-fp16.h5?download=true',
            'queries': lambda x: x['itest']['queries'],
            'data': lambda x: x['train'],
            'gt_url': 'https://huggingface.co/datasets/sadit/SISAP2025/resolve/main/benchmark-dev-ccnews-fp16.h5?download=true',
            'gt_I': lambda x: x['itest']['knns'],
        },
        # task2: all-knn — the train set is queried against itself;
        # ground truth comes from a separate all-knn file.
        'task2': {
            'url': 'https://huggingface.co/datasets/sadit/SISAP2025/resolve/main/benchmark-dev-ccnews-fp16.h5?download=true',
            'queries': lambda x: x['train'],
            'data': lambda x: x['train'],
            'gt_url': 'https://huggingface.co/datasets/sadit/SISAP2025/resolve/main/allknn-benchmark-dev-ccnews.h5?download=true',
            'gt_I': lambda x: x['knns']
        }
    }
}

eval.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import argparse
2+
import h5py
3+
import numpy as np
4+
import os
5+
import csv
6+
import glob
7+
from pathlib import Path
8+
from datasets import DATASETS
9+
10+
def get_groundtruth(size="100K", private=False, gt_fn=None):
    """Load the ground-truth neighbor ids ('knns') from an HDF5 file.

    Args:
        size: dataset size tag (currently unused; kept for interface
            stability). TODO confirm intended use.
        private: whether to evaluate against held-out private queries
            (currently unused).
        gt_fn: path to the ground-truth HDF5 file to read.

    Returns:
        numpy array of the ground-truth neighbor ids.

    Raises:
        ValueError: if *gt_fn* is not provided.
    """
    # The original body read an undefined name `out_fn`, so every call
    # raised NameError. Require the path explicitly and fail with a clear
    # message instead; use a context manager so the file closes on error.
    if gt_fn is None:
        raise ValueError("gt_fn (path to the ground-truth HDF5 file) is required")
    with h5py.File(gt_fn, "r") as gt_f:
        true_I = np.array(gt_f['knns'])
    return true_I
16+
17+
def get_all_results(dirname):
    """Yield open h5py files for every usable result under *dirname*.

    A usable result file contains a 'knns' dataset and a 'data' entry
    (either a dataset or a file attribute); other .h5 files are skipped
    with a message. Each yielded file is closed when the generator
    resumes, so consumers must finish reading a file before advancing
    the iterator.
    """
    mask = [dirname + "/**/*.h5"]
    print("search for results matching:")
    print("\n".join(mask))
    for m in mask:
        # recursive=True is required for the '**' wildcard to descend into
        # subdirectories; without it '**' behaves like a plain '*' and
        # nested result files are silently missed.
        for fn in glob.iglob(m, recursive=True):
            print(fn)
            f = h5py.File(fn, "r")
            if "knns" not in f or not ("data" in f or "data" in f.attrs):
                print("Ignoring " + fn)
                f.close()
                continue
            yield f
            f.close()
31+
32+
def get_recall(I, gt, k):
    """Mean recall@k of result ids *I* against ground-truth ids *gt*.

    Both arguments are (n, >=k) integer arrays; each row's first k
    entries are compared as sets, so duplicate ids within a row are
    counted once.
    """
    assert k <= I.shape[1]
    assert len(I) == len(gt)

    n = len(I)
    hits = sum(
        len(set(found[:k]) & set(expected[:k]))
        for found, expected in zip(I, gt)
    )
    return hits / (n * k)
41+
42+
43+
if __name__ == "__main__":
    # CLI entry point: scan result files, compute recall@10 for each,
    # and write one CSV row per result file.
    parser = argparse.ArgumentParser()
    parser.add_argument("--results", default="result",
                        help='directory in which results are stored')
    parser.add_argument('--private', action='store_true', default=False,
                        help="private queries held out for evaluation")
    parser.add_argument('--dataset', choices=['ccnews-small'],
                        default='ccnews-small')
    parser.add_argument("csvfile")
    args = parser.parse_args()

    true_I_cache = {}  # reserved for memoizing ground-truth loads; unused so far

    columns = ["data", "kind", "algo", "buildtime", "querytime", "params", "recall"]

    with open(args.csvfile, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        for res in get_all_results(args.results):
            # NOTE(review): the 'data' attribute appears to hold the task
            # kind (used to index DATASETS['ccnews-small']) — confirm
            # against the writer in search.py.
            task = res.attrs["data"]
            row = dict(res.attrs)
            print(row)
            gt_I = np.array(DATASETS['ccnews-small'][task]['gt_I'](res))
            recall = get_recall(np.array(res["knns"]), gt_I, 10)
            row['recall'] = recall
            print(row["data"], row["algo"], row["params"], "=>", recall)
            writer.writerow(row)

search/search.py renamed to search.py

Lines changed: 20 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,8 @@
44
import numpy as np
55
import os
66
from pathlib import Path
7-
from urllib.request import urlretrieve
87
import time
9-
10-
def download(src, dst):
11-
if not os.path.exists(dst):
12-
os.makedirs(Path(dst).parent, exist_ok=True)
13-
print('downloading %s -> %s...' % (src, dst))
14-
urlretrieve(src, dst)
15-
16-
def get_fn(kind):
17-
version = "ccnews-small"
18-
return os.path.join("data", kind, f"{version}.h5")
19-
20-
def prepare(kind):
21-
if kind == 'task2':
22-
url = "https://huggingface.co/datasets/sadit/SISAP2025/resolve/main/allknn-benchmark-dev-ccnews.h5?download=true"
23-
if kind == 'task1':
24-
url = "https://huggingface.co/datasets/sadit/SISAP2025/resolve/main/benchmark-dev-ccnews-fp16.h5?download=true"
25-
fn = get_fn(kind)
26-
27-
download(url, fn)
8+
from datasets import DATASETS, prepare, get_fn
289

2910
def store_results(dst, algo, kind, D, I, buildtime, querytime, params):
3011
os.makedirs(Path(dst).parent, exist_ok=True)
@@ -43,17 +24,21 @@ def run(kind, params):
4324

4425
prepare(kind)
4526

46-
fn = get_fn(kind)
27+
fn, _ = get_fn(kind)
4728
f = h5py.File(fn)
48-
data = np.array(f['train'])
49-
queries = np.array(f['itest']['queries'])
29+
data = np.array(DATASETS['ccnews-small'][kind]['data'](f))
30+
queries = np.array(DATASETS['ccnews-small'][kind]['queries'](f))
5031
f.close()
5132

5233
n, d = data.shape
5334
k = params['k']
5435

5536
nlist = 1024 # number of clusters/centroids to build the IVF from
56-
index_identifier = f"IVF{nlist},SQfp16"
37+
if kind == 'task1':
38+
index_identifier = f"IVF{nlist},SQfp16"
39+
elif kind == 'task2':
40+
index_identifier = f"IVF{nlist},PQ{d//2}x4fs"
41+
5742
index = faiss.index_factory(d, index_identifier)
5843

5944
print(f"Training index on {data.shape}")
@@ -64,6 +49,10 @@ def run(kind, params):
6449
print(f"Done training in {elapsed_build}s.")
6550
assert index.is_trained
6651

52+
if kind == "task2":
53+
index = faiss.IndexRefineFlat(index, faiss.swig_ptr(data.astype('float32')))
54+
index.k_factor = 200
55+
6756
for nprobe in [1, 2, 5, 10, 20, 50, 100]:
6857
print(f"Starting search on {queries.shape} with nprobe={nprobe}")
6958
start = time.time()
@@ -87,6 +76,13 @@ def run(kind, params):
8776
default='task2'
8877
)
8978

79+
parser.add_argument(
80+
'--dataset',
81+
choices=[
82+
'ccnews-small',
83+
],
84+
default='ccnews-small'
85+
)
9086

9187
params = {
9288
'task1': {

0 commit comments

Comments
 (0)