diff --git a/css/main.css b/css/main.css
new file mode 100644
index 0000000..4fc52fc
--- /dev/null
+++ b/css/main.css
@@ -0,0 +1,216 @@
+:root {
+  --secondary-color: #bb0000;
+  --nice-gray: rgb(236, 240, 243);
+  --main-width: 84ch;
+  font-size: 1.0em;
+  font-family: "Noto Sans Display", -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
+}
+
+html {
+  box-sizing: border-box;
+}
+
+*,*:before,*:after {
+  box-sizing: inherit;
+}
+
+body {
+  margin: 0;
+}
+
+main {
+  margin: 0 auto;
+  max-width: var(--main-width);
+  text-rendering: optimizeLegibility;
+  -webkit-font-smoothing: antialiased;
+  -moz-osx-font-smoothing: grayscale;
+  line-height: 1.7em;
+  min-height: calc(var(--main-width));
+}
+
+h1 {
+  text-align: center;
+  font-size: 2.8em;
+  line-height: 1.2em;
+}
+
+
+/***************
+  Tables
+***************/
+
+
+td,th {
+  padding: 4px 8px;
+  margin: 0;
+}
+th {
+  text-align: center;
+  font-size: 0.9em;
+}
+table {
+  max-width: 100%;
+  display: inline-block;
+  overflow-x: scroll;
+  white-space: nowrap;
+
+  padding: 0;
+  margin: auto;
+}
+th {
+  border-top: 2px solid black;
+  border-bottom: 1px solid black;
+}
+tbody > tr:last-child > td {
+  border-bottom: 2px solid black;
+}
+/* Highlight a table body row while the cursor hovers over it. */
+tbody > tr:hover {
+  background-color: var(--nice-gray);
+}
+/* Keep the first column visible while the table scrolls horizontally. */
+th.sticky,td.sticky {
+  position: sticky;
+  left: 0;
+  z-index: 2;
+  background: #fff;
+}
+
+pre.reference {
+  -webkit-overflow-scrolling: touch;
+  overflow-x: auto;
+  padding: 0.5em 0.5em;
+  white-space: pre;
+  word-wrap: normal;
+  line-height: 1.2em;
+  background-color: #f5f5f5;
+  font-size: 1.1em;
+  border-radius: 4px;
+}
+
+
+/***************
+  Helpers
+***************/
+
+
+.no-border-top {
+  border-top: 0;
+}
+.no-border-bottom {
+  border-bottom: 0;
+}
+.border-right {
+  border-right: 1px solid black;
+}
+.text-sm {
+  font-size: 0.875rem;
+  line-height: 1.25rem;
+}
+.centered {
+  text-align: center;
+}
+/* Let an element break out of the centered main column to roughly 90% of the viewport width. */
+.full-width {
+  width: max(90vw, 100%);
+  position: relative;
+  left: calc(-45vw + 50%);
+}
+.banded {
+  width: 100vw;
+  position: relative;
+  left: calc(-50vw + 50%);
+
+  padding: 1em;
+  text-align: center;
+  background-color: var(--nice-gray);
+}
+
+
+/***************
+  Figures
+***************/
+
+
+.figure-container {
+  display: grid;
+  grid-template-columns: 1fr 1fr;
+  column-gap: 4px;
+  row-gap: 4px;
+}
+
+@media only screen and (max-width: 768px) {
+  .figure-container {
+    grid-template-columns: 1fr;
+  }
+}
+
+figure {
+  margin: 0px;
+}
+
+/* display: contents removes the anchor's own box, so the image participates in the grid directly. */
+figure > a {
+  display: contents;
+}
+figure > a > img,figure > img {
+  width: 100%;
+}
+figcaption {
+  text-align: left;
+}
+
+
+/***************
+  Pill Buttons
+***************/
+
+
+.pill-button {
+  padding: 0.5em 1em;
+  border-radius: 9999px;
+  background-color: black;
+  color: white;
+  text-decoration: none;
+  margin-right: 0.25em;
+  white-space: nowrap;
+}
+
+.pill-button > span {
+  font-size: 1em;
+}
+
+.pill-button > img {
+  height: 1em;
+  display: inline-block;
+  transform: translate(0, 0.125em);
+}
+
+/***************
+  Dropdown
+***************/
+details {
+  margin-top: 6px;
+  display: inline-block;
+  min-width: 16ch;
+  text-align: center;
+  padding: 4px 12px;
+  position: relative;
+}
+details:hover {
+  cursor: pointer;
+}
+
+details > div.options {
+  position: absolute;
+  background: white;
+  width: 100%;
+  box-shadow: rgba(0, 0, 0, 0.1) 0px 1px 3px 0px, rgba(0, 0, 0, 0.06) 0px 1px 2px 0px;
+  border-radius: 4px;
+}
+
+details > div.options > p {
+  margin: 0;
+  padding: 6px 2px;
+}
+details > div.options > p:hover {
+  background-color: var(--nice-gray);
+}
\ No newline at end of file
diff --git a/index.html b/index.html
new file mode 100644
index 0000000..393b408
--- /dev/null
+++ b/index.html
@@ -0,0 +1,334 @@

BioCLIP: A Vision Foundation Model for the Tree of Life

Hierarchical Conditioning of Diffusion Models Using Tree-of-Life for Studying Species Evolution

1Mridul Khurana, 1Arka Daw,

1Virginia Tech,

*Sam and Lisa are co-first authors and contributed equally to BioCLIP.

mridul@vt.edu

Code

Figure 1: We use the CLIP objective (c) to train a ViT-B/16 on over 450K different class labels, all of which are taxonomic labels from the Tree of Life (a). Because the text encoder is an autoregressive language model, the representation of an order can only depend on higher ranks like class, phylum, and kingdom (b). This naturally leads to hierarchical representations for labels, helping the vision encoder learn image representations that are more aligned to the tree of life.

BioCLIP

Images of the natural world are a super-abundant source of biological information. There are many computational methods and tools, particularly from computer vision, for extracting information from images. However, existing methods are bespoke models built for a specific task; they are not adaptable or extendable to new questions, contexts, and datasets.

We develop BioCLIP, the first large-scale multimodal model for general biology questions on images. We leverage the unique properties of biology as the application domain for computer vision:

  1. The abundance and variety of images of plants, animals, and fungi.
  2. The availability of rich structured biological knowledge.

We curate and release TreeOfLife-10M (the largest and most diverse available dataset of biology images), train BioCLIP, rigorously benchmark our approach on diverse fine-grained biology classification tasks, and find that BioCLIP consistently and substantially outperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation further reveals that BioCLIP has learned a hierarchical representation conforming to the tree of life, shedding light on its strong generalizability.
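
To make the "structured biological knowledge" point concrete: the class labels are taxonomic names from the Tree of Life, which can be flattened from kingdom down to species into a single caption, as sketched in Figure 1. Below is a minimal illustration of that flattening in Python; the field names and the example record are hypothetical, not the exact TreeOfLife-10M schema.

RANKS = ["kingdom", "phylum", "class", "order", "family", "genus", "species"]

def taxonomic_caption(record: dict) -> str:
    """Join the ranks from kingdom down to species into a single label string."""
    return " ".join(record[rank] for rank in RANKS if record.get(rank))

# Hypothetical record for an American robin; real dataset fields may differ.
example = {
    "kingdom": "Animalia", "phylum": "Chordata", "class": "Aves",
    "order": "Passeriformes", "family": "Turdidae",
    "genus": "Turdus", "species": "migratorius",
}
print(taxonomic_caption(example))
# Animalia Chordata Aves Passeriformes Turdidae Turdus migratorius

Because the text encoder reads this string left to right, each rank's representation can depend only on the ranks above it, which is what yields the hierarchical label representations described in Figure 1.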

+ +

Demo

+ + + +

Experiments

+

+ We evaluate BioCLIP and three baselines (CLIP, OpenCLIP, and an iNat-only model that + uses the same procedure as BioCLIP but trained only on iNat21) on a diverse set of biology-related + classification tasks. + We do zero-shot classification with all models and report accuracy on the validation sets. + Bold indicates the best performance for each task. +
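
The zero-shot protocol is straightforward to reproduce. The sketch below follows the usual open_clip recipe; the hub identifier (hf-hub:imageomics/bioclip) and the prompt format are assumptions about how the released checkpoint is exposed, so check the code release for the exact names.

import torch
import open_clip
from PIL import Image

# Load the checkpoint together with its image transform and tokenizer.
model, _, preprocess = open_clip.create_model_and_transforms("hf-hub:imageomics/bioclip")
tokenizer = open_clip.get_tokenizer("hf-hub:imageomics/bioclip")
model.eval()

# One candidate prompt per species in the task's label set.
class_names = ["Turdus migratorius", "Cyanocitta cristata", "Cardinalis cardinalis"]
text = tokenizer([f"a photo of {name}." for name in class_names])
image = preprocess(Image.open("example.jpg")).unsqueeze(0)

with torch.no_grad():
    # Embed the image and all candidate prompts, then compare by cosine similarity.
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print(class_names[probs.argmax().item()])  # predicted species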

+

+ BioCLIP outperforms both general-domain baselines and our new iNat-only ViT model. +

+

+ Check out the paper for one-shot and five-shot results. +

+

Scroll to see all results.

Tasks are grouped as Animals (Birds 525, Plankton, Insects, Insects 2) and Plants & Fungi (PlantNet, Fungi, PlantVillage, Med. Leaf, PlantDoc); an asterisk marks the best score on each task.

Model        | Birds 525 | Plankton | Insects | Insects 2 | PlantNet | Fungi | PlantVillage | Med. Leaf | PlantDoc | Rare Species | Mean
CLIP         |      49.9 |      3.2 |     9.1 |       9.8 |     58.5 |  10.2 |          5.4 |      15.9 |     26.1 |         26.6 |  21.4
OpenCLIP     |      54.7 |      2.2 |     6.5 |       9.6 |     50.2 |   5.7 |          8.0 |      12.4 |     25.8 |         31.0 |  20.6
BioCLIP      |     72.1* |     6.1* |   34.8* |     20.4* |    91.4* |  40.7 |        24.4* |     38.6* |    28.4* |        37.8* | 39.4*
iNat21 Only  |      56.1 |      2.6 |    30.7 |      11.5 |     88.2 | 43.0* |         18.4 |      25.6 |     20.5 |         19.4 |  31.6

Evaluation Examples

BioCLIP correctly labels this Clitocybe fragrans from the Fungi task.

BioCLIP correctly labels this Jasmine leaf (Jasminum) from the Medicinal Leaf task.

BioCLIP correctly labels this Rhizosolenia from the Plankton task. BioCLIP performs well on a variety of image sources, like this microscope image.

CLIP mislabels this Russula ochroleuca as "the prince" (Agaricus augustus), which isn't even in the same family as Russula ochroleuca, while BioCLIP correctly labels it.

CLIP mislabels this Roxburgh fig (Ficus auriculata) as a Peepal tree (Ficus religiosa), while BioCLIP correctly labels it.

CLIP mislabels this Strombidium conicum as a ciliate mix, while BioCLIP correctly labels it.

Intrinsic Evaluation

Why does BioCLIP work so well? We conduct an intrinsic evaluation to understand the representations it has learned. We visualize BioCLIP's and CLIP's representations of the 100K unseen images in the iNat21 validation set, using t-SNE to plot them in two dimensions and coloring the points by class. In the figure below, (B) means BioCLIP and (O) means OpenAI's CLIP.

At higher ranks like phylum, both CLIP and BioCLIP separate the classes well, but BioCLIP's representation is more fine-grained and contains a richer clustering structure. At lower ranks, BioCLIP produces far more separable features, while CLIP's features tend to be cluttered and lack a clear structure. This shows that BioCLIP has learned a rich feature representation following the hierarchical structure of the taxonomy, which helps explain its strong generalization across the tree of life.

We show that (B)ioCLIP's representations are more fine-grained and contain a richer clustering structure than (O)penAI's CLIP. Check out our paper for more details.
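
The visualization itself is simple to reproduce. The sketch below assumes you already have an (N, D) array of image embeddings from one of the models and one taxonomic name per image at the rank you want to color by; the function and variable names are illustrative.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

def plot_tsne(features: np.ndarray, labels: list[str], title: str) -> None:
    """Project (N, D) embeddings to 2-D with t-SNE and color the points by label."""
    points = TSNE(n_components=2, init="pca", random_state=0).fit_transform(features)
    for name in sorted(set(labels)):
        mask = np.array([label == name for label in labels])
        plt.scatter(points[mask, 0], points[mask, 1], s=2, label=name)
    plt.title(title)
    plt.legend(markerscale=4, fontsize="small")
    plt.savefig(f"tsne_{title.lower().replace(' ', '_')}.png", dpi=300)
    plt.close()

# e.g. plot_tsne(bioclip_features, phylum_labels, "BioCLIP colored by phylum")
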

Dataset

TreeOfLife-10M is the largest and most diverse available dataset of biology images. We combine images from three sources: iNaturalist, BIOSCAN-1M, and the Encyclopedia of Life (EOL, accessed 29 July 2023), creating a dataset of 10M images spanning 450K+ species. We train BioCLIP on TreeOfLife-10M and release the weights for public use.

Reference

Please cite our paper if you use our code, data, model, or results.

@inproceedings{stevens2024bioclip,
  title     = {{B}io{CLIP}: A Vision Foundation Model for the Tree of Life},
  author    = {Samuel Stevens and Jiaman Wu and Matthew J Thompson and Elizabeth G Campolongo and Chan Hee Song and David Edward Carlyn and Li Dong and Wasila M Dahdul and Charles Stewart and Tanya Berger-Wolf and Wei-Lun Chao and Yu Su},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  year      = {2024}
}

Also consider citing OpenCLIP, iNat21 and BIOSCAN-1M:

@software{ilharco_gabriel_2021_5143773,
  author = {Ilharco, Gabriel and Wortsman, Mitchell and Wightman, Ross and Gordon, Cade and Carlini, Nicholas and Taori, Rohan and Dave, Achal and Shankar, Vaishaal and Namkoong, Hongseok and Miller, John and Hajishirzi, Hannaneh and Farhadi, Ali and Schmidt, Ludwig},
  title  = {OpenCLIP},
  year   = {2021},
  doi    = {10.5281/zenodo.5143773},
}

@misc{inat2021,
  author    = {Van Horn, Grant and Mac Aodha, Oisin},
  title     = {iNat Challenge 2021 - FGVC8},
  publisher = {Kaggle},
  year      = {2021},
  url       = {https://kaggle.com/competitions/inaturalist-2021}
}

@inproceedings{gharaee2023step,
  author    = {Gharaee, Z. and Gong, Z. and Pellegrino, N. and Zarubiieva, I. and Haurum, J. B. and Lowe, S. C. and McKeown, J. T. A. and Ho, C. Y. and McLeod, J. and Wei, Y. C. and Agda, J. and Ratnasingham, S. and Steinke, D. and Chang, A. X. and Taylor, G. W. and Fieguth, P.},
  title     = {A Step Towards Worldwide Biodiversity Assessment: The {BIOSCAN-1M} Insect Dataset},
  booktitle = {Advances in Neural Information Processing Systems ({NeurIPS}) Datasets \& Benchmarks Track},
  year      = {2023},
}

Acknowledgements

The authors would like to thank Josef Uyeda, Jim Balhoff, Dan Rubenstein, Hank Bart, Hilmar Lapp, Sara Beery, and colleagues from the Imageomics Institute and the OSU NLP group for their valuable feedback. We also thank the BIOSCAN-1M team and the iNaturalist team for making their data available and easy to use, and Jennifer Hammack at EOL for her invaluable help in accessing EOL’s images.

This work was supported by the Imageomics Institute, which is funded by the US National Science Foundation's Harnessing the Data Revolution (HDR) program under Award #2118240 (Imageomics: A New Frontier of Biological Information Powered by Knowledge-Guided Machine Learning). Any opinions, findings and conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the National Science Foundation.