docs: Update README to enhance contribution guidelines and add Gradio demo instructions
iscc · Aug 19, 2024 · 147e9c4 · 147e9c4
1 parent fb66ea9
commit 147e9c4
Show file tree

Hide file tree

Showing 6 changed files with 71 additions and 59 deletions.
diff --git a/README.md b/README.md
@@ -144,9 +144,8 @@ This process ensures robustness to variations and translations, enabling cross-l
 
 ## Development and Contributing
 
-We welcome contributions to enhance the capabilities, efficiency, and compatibility of this proof of
-concept with the broader ISCC ecosystem. For development, install the project in development mode
-using [Poetry](https://python-poetry.org):
+We welcome contributions to enhance the capabilities and efficiency of this proof of concept. For
+development, install the project in development mode using [Poetry](https://python-poetry.org):
 
 ```shell
 git clone https://github.com/iscc/iscc-sct.git
@@ -157,6 +156,43 @@ poetry install
 If you have suggestions for improvements or bug fixes, please open an issue or pull request. For
 major changes, please open an issue first to discuss your ideas.
 
+**We particularly welcome recommendations for other multilingual text embedding models trained with
+Matryoshka Representation Learning (MRL) and optimized for binarization. Such contributions could
+significantly improve the performance and efficiency of the ISCC Semantic Text-Code generation.**
+
+## Gradio Demo
+
+This repository also provides an interactive Gradio demo that allows you to explore the capabilities
+of ISCC Semantic Text-Code. The demo showcases:
+
+- Generation of ISCC Semantic Text-Codes for input texts
+- Comparison of two texts and their similarity based on the generated codes
+- Visualization of text chunking and granular matches
+- Adjustable parameters like ISCC bit-length and maximum tokens per chunk
+
+You can access the live version of the Gradio demo at:
+[https://huggingface.co/spaces/iscc/iscc-sct](https://huggingface.co/spaces/iscc/iscc-sct)
+
+### Running the Gradio Demo Locally
+
+To run the Gradio demo locally, you first need to install the `iscc-sct` package with the optional
+`demo` dependency:
+
+```shell
+pip install iscc-sct[demo]
+```
+
+This will ensure that Gradio and other necessary dependencies for the demo are installed.
+
+After installation, you can use the `sct` command-line tool that comes with the package:
+
+```shell
+sct gui
+```
+
+This command will launch the Gradio interface in your default web browser, allowing you to interact
+with the demo on your local machine.
+
 ## Supported Languages:
 
 Arabic, Armenian, Bengali, Bosnian, Bulgarian, Burmese, Catalan, Chinese (China), Chinese (Taiwan),
@@ -191,39 +227,6 @@ short 64-bit Semantic Text-Codes.
 A text embedding model with support for a larger `max_token` size (currently 128) may yield
 higher-order granular simprints based on larger chunks of text.
 
-## Gradio Demo
-
-We provide an interactive Gradio demo that allows you to explore the capabilities of ISCC Semantic
-Text-Code. The demo showcases:
-
-- Generation of ISCC Semantic Text-Codes for input texts
-- Comparison of two texts and their similarity based on the generated codes
-- Visualization of text chunking and granular matches
-- Adjustable parameters like ISCC bit-length and maximum tokens per chunk
-
-You can access the live version of the Gradio demo at:
-[https://huggingface.co/spaces/iscc/iscc-sct](https://huggingface.co/spaces/iscc/iscc-sct)
-
-### Running the Gradio Demo Locally
-
-To run the Gradio demo locally, you first need to install the `iscc-sct` package with the optional
-`demo` dependency:
-
-```shell
-pip install iscc-sct[demo]
-```
-
-This will ensure that Gradio and other necessary dependencies for the demo are installed.
-
-After installation, you can use the `sct` command-line tool that comes with the package:
-
-```shell
-sct gui
-```
-
-This command will launch the Gradio interface in your default web browser, allowing you to interact
-with the demo on your local machine.
-
 ## Acknowledgements
 
 - Text Chunking: [text-splitter](https://github.com/benbrandt/text-splitter)

diff --git a/iscc_sct/models.py b/iscc_sct/models.py
@@ -70,7 +70,7 @@
 To use the `FeatureSet` model, you can either provide data in the Index-Format or Object-Format.
 """
 
-from typing import List, Optional, Dict, Any, Union
+from typing import List, Optional, Union
 from pydantic import BaseModel
 
 

diff --git a/iscc_sct/utils.py b/iscc_sct/utils.py
@@ -23,11 +23,13 @@
     "get_model",
     "encode_base32",
     "encode_base64",
+    "decode_base32",
+    "decode_base64",
     "hamming_distance",
     "iscc_distance",
-    "MODEL_PATH",
     "cosine_similarity",
     "granular_similarity",
+    "MODEL_PATH",
 ]
 
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -57,10 +57,11 @@ tokenizers = "*"
 pydantic-settings = "*"
 charset-normalizer = "*"
 numpy = "<2.0.0"
-pybase64 = "^1.4.0"
+pybase64 = "*"
 certifi = ">=2024.07.04"
 gradio = { version = "*", optional = true }
-pyyaml = "^6.0.2"
+pyyaml = "*"
+pydantic = "*"
 
 
 [tool.poetry.extras]

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -138,8 +138,8 @@ def test_granular_similarity():
         features=[
             FeatureSet(
                 simprints=[
-                    Feature(simprint="AAAA"),  # Will match
-                    Feature(simprint="BBBB"),  # Will not match
+                    Feature(simprint="AAECAwQFBgc"),  # Will match
+                    Feature(simprint="CAkKCwwNDg8"),  # Will not match
                 ]
             )
         ],
@@ -150,8 +150,8 @@ def test_granular_similarity():
         features=[
             FeatureSet(
                 simprints=[
-                    Feature(simprint="AAAA"),  # Will match
-                    Feature(simprint="CCCC"),  # Will not match
+                    Feature(simprint="AAECAwQFBgc"),  # Will match
+                    Feature(simprint="EBESExQVFhc"),  # Will not match
                 ]
             )
         ],
@@ -160,13 +160,13 @@ def test_granular_similarity():
     # Test with default threshold
     matches = utils.granular_similarity(metadata_a, metadata_b)
     assert len(matches) == 1
-    assert matches[0][0].simprint == "AAAA"
+    assert matches[0][0].simprint == "AAECAwQFBgc"
     assert matches[0][1] == 100
-    assert matches[0][2].simprint == "AAAA"
+    assert matches[0][2].simprint == "AAECAwQFBgc"
 
     # Test with lower threshold
     matches = utils.granular_similarity(metadata_a, metadata_b, threshold=0)
-    assert len(matches) == 4  # All combinations should match
+    assert len(matches) == 2  # All combinations should match
 
     # Test with higher threshold
     matches = utils.granular_similarity(metadata_a, metadata_b, threshold=101)
@@ -177,11 +177,13 @@ def test_granular_similarity_no_matches():
     from iscc_sct.models import Metadata, FeatureSet, Feature
 
     metadata_a = Metadata(
-        iscc="ISCC:KACYPXW563EDNM", features=[FeatureSet(simprints=[Feature(simprint="AAAA")])]
+        iscc="ISCC:KACYPXW563EDNM",
+        features=[FeatureSet(simprints=[Feature(simprint="AAECAwQFBgc")])],
     )
 
     metadata_b = Metadata(
-        iscc="ISCC:KACYPXW563EDNM", features=[FeatureSet(simprints=[Feature(simprint="BBBB")])]
+        iscc="ISCC:KACYPXW563EDNM",
+        features=[FeatureSet(simprints=[Feature(simprint="CAkKCwwNDg8")])],
     )
 
     matches = utils.granular_similarity(metadata_a, metadata_b)
@@ -194,22 +196,26 @@ def test_granular_similarity_multiple_matches():
     metadata_a = Metadata(
         iscc="ISCC:KACYPXW563EDNM",
         features=[
-            FeatureSet(simprints=[Feature(simprint="AAAA"), Feature(simprint="BBBB")]),
-            FeatureSet(simprints=[Feature(simprint="CCCC")]),
+            FeatureSet(
+                simprints=[Feature(simprint="AAECAwQFBgc"), Feature(simprint="CAkKCwwNDg8")]
+            ),
+            FeatureSet(simprints=[Feature(simprint="EBESExQVFhc")]),
         ],
     )
 
     metadata_b = Metadata(
         iscc="ISCC:KACYPXW563EDNM",
         features=[
-            FeatureSet(simprints=[Feature(simprint="AAAA"), Feature(simprint="DDDD")]),
-            FeatureSet(simprints=[Feature(simprint="CCCC")]),
+            FeatureSet(
+                simprints=[Feature(simprint="AAECAwQFBgc"), Feature(simprint="GBkaGxwdHh8")]
+            ),
+            FeatureSet(simprints=[Feature(simprint="EBESExQVFhc")]),
         ],
     )
 
     matches = utils.granular_similarity(metadata_a, metadata_b)
     assert len(matches) == 2
     assert {(match[0].simprint, match[2].simprint) for match in matches} == {
-        ("AAAA", "AAAA"),
-        ("CCCC", "CCCC"),
+        ("AAECAwQFBgc", "AAECAwQFBgc"),
+        ("EBESExQVFhc", "EBESExQVFhc"),
     }