biocap/.zenodo.json at main · Imageomics/biocap · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
{
    "creators": [
        {
            "name": "Zhang, Ziheng",
            "affiliation": "The Ohio State University"
        },
        {
            "name": "Ma, Xinyue",
            "affiliation": "The Ohio State University"
        },
        {
            "name": "Campolongo, Elizabeth G.",
            "affiliation": "The Ohio State University"
        },
        {
            "name": "Thompson, Matthew J.",
            "affiliation": "The Ohio State University"
        },
        {
            "name": "Zhang, Net",
            "affiliation": "The Ohio State University"
        },
        {
            "name": "Gu, Jianyang",
            "affiliation": "The Ohio State University"
        }
    ],
    "description": "This work investigates descriptive captions as an additional source of supervision for biological multimodal foundation models. Images and captions can be viewed as complementary samples from the latent morphospace of a species, each capturing certain biological traits. Incorporating captions during training encourages alignment with this shared latent structure, emphasizing potentially diagnostic characters while suppressing spurious correlations. The main challenge, however, lies in obtaining faithful, instance-specific captions at scale. This requirement has limited the utilization of natural language supervision in organismal biology  compared with many other scientific domains. We complement this gap by generating synthetic captions with multimodal large language models (MLLMs), guided by Wikipedia-derived visual information and taxon-tailored format examples.  These domain-specific contexts help reduce hallucination and yield accurate, instance-based descriptive captions. Using these captions, we train BioCAP (i.e., BioCLIP with Captions), a biological foundation model that captures rich semantics  and achieves strong performance in species classification and text-image retrieval. These results demonstrate the value of descriptive captions beyond labels in bridging biological images with multimodal foundation models.",
    "notes": "<p>If you use this software, please cite both the article and the software itself.</p>\n<p>&nbsp;</p>\n<p>Article citation:</p>\n<p><code>@article{zhang2025biocap,</code><br><code>&nbsp; title = {{B}io{CAP}: Exploiting Synthetic Captions Beyond Labels in Biological Foundation Models},&nbsp;</code><br><code>&nbsp; author = {Ziheng Zhang and Xinyue Ma and Arpita Chowdhury and Elizabeth G Campolongo and Matthew J Thompson and Net Zhang and Samuel Stevens and Hilmar Lapp and Tanya Berger-Wolf and Yu Su and Wei-Lun Chao and Jianyang Gu},</code><br><code>&nbsp; year = {2025},</code><br><code>&nbsp; eprint={2510.20095},</code><br><code>&nbsp; archivePrefix={arXiv},</code><br><code>&nbsp; primaryClass={cs.CV},</code><br><code>&nbsp; url={https://arxiv.org/abs/2510.20095},&nbsp;</code><br><code>}</code></p>",
    "keywords": [
        "clip",
        "biology",
        "CV",
        "imageomics",
        "animals",
        "plants",
        "fungi",
        "species",
        "images",
        "taxonomy",
        "rare species",
        "endangered species",
        "evolutionary biology",
        "multimodal",
        "knowledge-guided",
        "synthetic captions",
        "caption-generation"
    ],
    "license": {
        "id": "MIT"
    },
    "publication_date": "2025-10-24",
    "title": "BioCAP",
    "version": "1.0.0",
    "grants": [
        {
            "id": "021nxhr62::2118240"
        }
    ],
    "references": [
        "Gu, J., Stevens, S., Campolongo, E. G., Thompson, M. J., Zhang, N., Wu, J., Kopanev, A., Mai, Z, White, A. E., Balhoff, J., Dahdul, W., Rubenstein, D., Lapp, H., Berger-Wolf, T., Chao, W., Su, Y. BioCLIP 2: Emergent Properties from Scaling Hierarchical Contrastive Learning. arXiv preprint arXiv:2505.23883, 2025.",
        "Stevens, S., Wu, J., Thompson, M. J., Campolongo, E. G., Song, C. H., Carlyn, D. E., Dong, L., Dahdul, W. M., Stewart, C., Berger-Wolf, T., Chao, W., & Su, Y. (2024). BioCLIP: A Vision Foundation Model for the Tree of Life [Conference paper]. Proceedings of IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)",
        "Ilharco, G., Wortsman, M., Wightman, R., Gordon, C., Carlini, N., Taori, R., Dave, A., Shankar, V., Namkoong, H., Miller, J., Hajishirzi, H., Farhadi, A., & Schmidt, L. (2021). OpenCLIP (Version v0.1) [Computer software]. https://doi.org/10.5281/zenodo.5143773"
    ],
    "related_identifiers": [
        {
        "identifier": "10.48550/arXiv.2510.20095",
        "relation": "isSupplementTo",
        "resource_type": "publication-preprint"
        }
    ]
}